aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/dir.c4
-rw-r--r--fs/afs/file.c5
-rw-r--r--fs/afs/fs_probe.c41
-rw-r--r--fs/afs/inode.c18
-rw-r--r--fs/afs/internal.h9
-rw-r--r--fs/afs/misc.c52
-rw-r--r--fs/afs/rotate.c53
-rw-r--r--fs/afs/vl_probe.c47
-rw-r--r--fs/afs/vl_rotate.c50
-rw-r--r--fs/aio.c294
-rw-r--r--fs/autofs/autofs_i.h13
-rw-r--r--fs/autofs/dev-ioctl.c27
-rw-r--r--fs/autofs/init.c2
-rw-r--r--fs/autofs/inode.c67
-rw-r--r--fs/autofs/root.c16
-rw-r--r--fs/autofs/waitq.c10
-rw-r--r--fs/bfs/bfs.h11
-rw-r--r--fs/bfs/dir.c4
-rw-r--r--fs/bfs/file.c2
-rw-r--r--fs/bfs/inode.c65
-rw-r--r--fs/binfmt_aout.c4
-rw-r--r--fs/binfmt_script.c10
-rw-r--r--fs/block_dev.c44
-rw-r--r--fs/btrfs/backref.c13
-rw-r--r--fs/btrfs/btrfs_inode.h14
-rw-r--r--fs/btrfs/check-integrity.c24
-rw-r--r--fs/btrfs/compression.c26
-rw-r--r--fs/btrfs/ctree.c46
-rw-r--r--fs/btrfs/ctree.h267
-rw-r--r--fs/btrfs/delayed-ref.c61
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/dev-replace.c191
-rw-r--r--fs/btrfs/dev-replace.h8
-rw-r--r--fs/btrfs/disk-io.c128
-rw-r--r--fs/btrfs/disk-io.h10
-rw-r--r--fs/btrfs/extent-tree.c1254
-rw-r--r--fs/btrfs/extent_io.c413
-rw-r--r--fs/btrfs/extent_io.h66
-rw-r--r--fs/btrfs/extent_map.c3
-rw-r--r--fs/btrfs/extent_map.h21
-rw-r--r--fs/btrfs/file-item.c13
-rw-r--r--fs/btrfs/file.c53
-rw-r--r--fs/btrfs/free-space-tree.c15
-rw-r--r--fs/btrfs/inode.c665
-rw-r--r--fs/btrfs/ioctl.c643
-rw-r--r--fs/btrfs/lzo.c2
-rw-r--r--fs/btrfs/ordered-data.c30
-rw-r--r--fs/btrfs/ordered-data.h47
-rw-r--r--fs/btrfs/qgroup.c38
-rw-r--r--fs/btrfs/qgroup.h6
-rw-r--r--fs/btrfs/raid56.c2
-rw-r--r--fs/btrfs/reada.c16
-rw-r--r--fs/btrfs/ref-verify.c6
-rw-r--r--fs/btrfs/relocation.c51
-rw-r--r--fs/btrfs/scrub.c85
-rw-r--r--fs/btrfs/send.c19
-rw-r--r--fs/btrfs/super.c93
-rw-r--r--fs/btrfs/sysfs.c14
-rw-r--r--fs/btrfs/sysfs.h2
-rw-r--r--fs/btrfs/tests/btrfs-tests.c4
-rw-r--r--fs/btrfs/tests/extent-io-tests.c29
-rw-r--r--fs/btrfs/tests/inode-tests.c6
-rw-r--r--fs/btrfs/transaction.c93
-rw-r--r--fs/btrfs/transaction.h16
-rw-r--r--fs/btrfs/tree-checker.c14
-rw-r--r--fs/btrfs/tree-log.c44
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c779
-rw-r--r--fs/btrfs/volumes.h25
-rw-r--r--fs/btrfs/xattr.c8
-rw-r--r--fs/buffer.c12
-rw-r--r--fs/cachefiles/namei.c8
-rw-r--r--fs/cachefiles/rdwr.c9
-rw-r--r--fs/cachefiles/xattr.c3
-rw-r--r--fs/ceph/addr.c5
-rw-r--r--fs/ceph/caps.c75
-rw-r--r--fs/ceph/inode.c60
-rw-r--r--fs/ceph/mds_client.c129
-rw-r--r--fs/ceph/mds_client.h16
-rw-r--r--fs/ceph/mdsmap.c1
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/ceph/super.h6
-rw-r--r--fs/cifs/Kconfig7
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/cifs_debug.c12
-rw-r--r--fs/cifs/cifs_dfs_ref.c138
-rw-r--r--fs/cifs/cifs_fs_sb.h9
-rw-r--r--fs/cifs/cifsencrypt.c13
-rw-r--r--fs/cifs/cifsfs.c17
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h15
-rw-r--r--fs/cifs/cifsproto.h28
-rw-r--r--fs/cifs/cifssmb.c88
-rw-r--r--fs/cifs/connect.c924
-rw-r--r--fs/cifs/dfs_cache.c1367
-rw-r--r--fs/cifs/dfs_cache.h97
-rw-r--r--fs/cifs/dir.c2
-rw-r--r--fs/cifs/file.c50
-rw-r--r--fs/cifs/inode.c44
-rw-r--r--fs/cifs/misc.c68
-rw-r--r--fs/cifs/readdir.c9
-rw-r--r--fs/cifs/sess.c4
-rw-r--r--fs/cifs/smb1ops.c15
-rw-r--r--fs/cifs/smb2inode.c18
-rw-r--r--fs/cifs/smb2maperror.c4
-rw-r--r--fs/cifs/smb2ops.c317
-rw-r--r--fs/cifs/smb2pdu.c148
-rw-r--r--fs/cifs/smb2pdu.h3
-rw-r--r--fs/cifs/smb2proto.h12
-rw-r--r--fs/cifs/smbdirect.c2
-rw-r--r--fs/cifs/transport.c8
-rw-r--r--fs/dax.c61
-rw-r--r--fs/direct-io.c8
-rw-r--r--fs/dlm/ast.c10
-rw-r--r--fs/dlm/lock.c17
-rw-r--r--fs/dlm/lockspace.c9
-rw-r--r--fs/dlm/member.c7
-rw-r--r--fs/dlm/memory.c9
-rw-r--r--fs/dlm/user.c5
-rw-r--r--fs/eventpoll.c272
-rw-r--r--fs/exec.c116
-rw-r--r--fs/exofs/super.c37
-rw-r--r--fs/exportfs/expfs.c3
-rw-r--r--fs/ext2/super.c13
-rw-r--r--fs/ext2/xattr.c5
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/ext4/readpage.c2
-rw-r--r--fs/f2fs/acl.c20
-rw-r--r--fs/f2fs/checkpoint.c33
-rw-r--r--fs/f2fs/data.c162
-rw-r--r--fs/f2fs/debug.c25
-rw-r--r--fs/f2fs/dir.c20
-rw-r--r--fs/f2fs/f2fs.h98
-rw-r--r--fs/f2fs/file.c42
-rw-r--r--fs/f2fs/gc.c81
-rw-r--r--fs/f2fs/inline.c20
-rw-r--r--fs/f2fs/inode.c22
-rw-r--r--fs/f2fs/namei.c8
-rw-r--r--fs/f2fs/node.c40
-rw-r--r--fs/f2fs/node.h2
-rw-r--r--fs/f2fs/recovery.c4
-rw-r--r--fs/f2fs/segment.c101
-rw-r--r--fs/f2fs/segment.h2
-rw-r--r--fs/f2fs/shrinker.c2
-rw-r--r--fs/f2fs/super.c170
-rw-r--r--fs/f2fs/sysfs.c27
-rw-r--r--fs/f2fs/xattr.c22
-rw-r--r--fs/fat/cache.c2
-rw-r--r--fs/fat/dir.c8
-rw-r--r--fs/fat/fat.h30
-rw-r--r--fs/fat/fatent.c16
-rw-r--r--fs/fat/inode.c26
-rw-r--r--fs/fat/misc.c2
-rw-r--r--fs/file.c31
-rw-r--r--fs/file_table.c7
-rw-r--r--fs/fscache/object.c3
-rw-r--r--fs/fuse/dir.c26
-rw-r--r--fs/fuse/file.c64
-rw-r--r--fs/fuse/fuse_i.h4
-rw-r--r--fs/fuse/inode.c5
-rw-r--r--fs/gfs2/aops.c16
-rw-r--r--fs/gfs2/bmap.c10
-rw-r--r--fs/gfs2/file.c10
-rw-r--r--fs/gfs2/glock.c2
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c17
-rw-r--r--fs/gfs2/incore.h3
-rw-r--r--fs/gfs2/inode.c18
-rw-r--r--fs/gfs2/inode.h10
-rw-r--r--fs/gfs2/log.c5
-rw-r--r--fs/gfs2/log.h5
-rw-r--r--fs/gfs2/lops.c257
-rw-r--r--fs/gfs2/lops.h4
-rw-r--r--fs/gfs2/ops_fstype.c1
-rw-r--r--fs/gfs2/recovery.c178
-rw-r--r--fs/gfs2/recovery.h5
-rw-r--r--fs/gfs2/rgrp.c4
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/gfs2/super.c1
-rw-r--r--fs/gfs2/trans.c8
-rw-r--r--fs/hfs/btree.c3
-rw-r--r--fs/hfsplus/btree.c3
-rw-r--r--fs/hfsplus/dir.c1
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/inode.c21
-rw-r--r--fs/hugetlbfs/inode.c61
-rw-r--r--fs/inode.c4
-rw-r--r--fs/ioctl.c2
-rw-r--r--fs/iomap.c41
-rw-r--r--fs/jffs2/super.c3
-rw-r--r--fs/kernfs/file.c23
-rw-r--r--fs/lockd/clnt4xdr.c22
-rw-r--r--fs/lockd/clntproc.c8
-rw-r--r--fs/lockd/clntxdr.c22
-rw-r--r--fs/lockd/svclock.c2
-rw-r--r--fs/lockd/xdr.c4
-rw-r--r--fs/lockd/xdr4.c4
-rw-r--r--fs/locks.c344
-rw-r--r--fs/namei.c3
-rw-r--r--fs/namespace.c160
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/callback.c10
-rw-r--r--fs/nfs/client.c9
-rw-r--r--fs/nfs/delegation.c28
-rw-r--r--fs/nfs/delegation.h10
-rw-r--r--fs/nfs/dir.c59
-rw-r--r--fs/nfs/direct.c9
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c70
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h8
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c16
-rw-r--r--fs/nfs/inode.c13
-rw-r--r--fs/nfs/internal.h17
-rw-r--r--fs/nfs/nfs3proc.c4
-rw-r--r--fs/nfs/nfs4_fs.h68
-rw-r--r--fs/nfs/nfs4client.c4
-rw-r--r--fs/nfs/nfs4proc.c164
-rw-r--r--fs/nfs/nfs4renewd.c9
-rw-r--r--fs/nfs/nfs4session.c5
-rw-r--r--fs/nfs/nfs4state.c131
-rw-r--r--fs/nfs/nfs4trace.h456
-rw-r--r--fs/nfs/pagelist.c4
-rw-r--r--fs/nfs/pnfs.c14
-rw-r--r--fs/nfs/pnfs.h10
-rw-r--r--fs/nfs/pnfs_dev.c4
-rw-r--r--fs/nfs/pnfs_nfs.c2
-rw-r--r--fs/nfs/proc.c2
-rw-r--r--fs/nfs/super.c47
-rw-r--r--fs/nfs/unlink.c20
-rw-r--r--fs/nfs/write.c26
-rw-r--r--fs/nfsd/nfs4callback.c31
-rw-r--r--fs/nfsd/nfs4layouts.c1
-rw-r--r--fs/nfsd/nfs4proc.c15
-rw-r--r--fs/nfsd/nfs4recover.c17
-rw-r--r--fs/nfsd/nfs4state.c16
-rw-r--r--fs/nfsd/nfscache.c2
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nfsd/state.h2
-rw-r--r--fs/nfsd/vfs.c17
-rw-r--r--fs/notify/fanotify/fanotify.c32
-rw-r--r--fs/notify/fanotify/fanotify_user.c12
-rw-r--r--fs/notify/fdinfo.c1
-rw-r--r--fs/notify/fsnotify.c2
-rw-r--r--fs/ntfs/malloc.h2
-rw-r--r--fs/ocfs2/Makefile2
-rw-r--r--fs/ocfs2/aops.c3
-rw-r--r--fs/ocfs2/buffer_head_io.c2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c17
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlmfs/Makefile2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c7
-rw-r--r--fs/ocfs2/export.c2
-rw-r--r--fs/ocfs2/journal.c6
-rw-r--r--fs/ocfs2/localalloc.c12
-rw-r--r--fs/ocfs2/locks.c10
-rw-r--r--fs/ocfs2/move_extents.c47
-rw-r--r--fs/openpromfs/inode.c11
-rw-r--r--fs/orangefs/inode.c2
-rw-r--r--fs/orangefs/orangefs-bufmap.c2
-rw-r--r--fs/overlayfs/dir.c14
-rw-r--r--fs/overlayfs/export.c6
-rw-r--r--fs/overlayfs/inode.c17
-rw-r--r--fs/pnode.c1
-rw-r--r--fs/proc/array.c10
-rw-r--r--fs/proc/base.c20
-rw-r--r--fs/proc/inode.c4
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/proc_sysctl.c13
-rw-r--r--fs/proc/task_mmu.c9
-rw-r--r--fs/proc/util.c1
-rw-r--r--fs/pstore/ftrace.c2
-rw-r--r--fs/pstore/inode.c51
-rw-r--r--fs/pstore/platform.c173
-rw-r--r--fs/pstore/pmsg.c2
-rw-r--r--fs/pstore/ram.c78
-rw-r--r--fs/pstore/ram_core.c47
-rw-r--r--fs/quota/quota.c3
-rw-r--r--fs/read_write.c15
-rw-r--r--fs/readdir.c10
-rw-r--r--fs/select.c367
-rw-r--r--fs/splice.c7
-rw-r--r--fs/super.c24
-rw-r--r--fs/sysfs/file.c4
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/ubifs/Kconfig16
-rw-r--r--fs/ubifs/auth.c5
-rw-r--r--fs/ubifs/file.c2
-rw-r--r--fs/ubifs/lpt.c12
-rw-r--r--fs/ubifs/replay.c72
-rw-r--r--fs/ubifs/sb.c13
-rw-r--r--fs/udf/inode.c6
-rw-r--r--fs/udf/super.c16
-rw-r--r--fs/udf/unicode.c14
-rw-r--r--fs/userfaultfd.c39
-rw-r--r--fs/xfs/libxfs/xfs_ag.c9
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c79
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h4
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h4
-rw-r--r--fs/xfs/libxfs/xfs_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_defer.c67
-rw-r--r--fs/xfs/libxfs/xfs_defer.h37
-rw-r--r--fs/xfs/libxfs/xfs_format.h12
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c54
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c7
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c240
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h54
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c6
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c14
-rw-r--r--fs/xfs/libxfs/xfs_types.c9
-rw-r--r--fs/xfs/libxfs/xfs_types.h22
-rw-r--r--fs/xfs/scrub/agheader.c25
-rw-r--r--fs/xfs/scrub/agheader_repair.c5
-rw-r--r--fs/xfs/scrub/alloc.c4
-rw-r--r--fs/xfs/scrub/btree.c45
-rw-r--r--fs/xfs/scrub/btree.h22
-rw-r--r--fs/xfs/scrub/common.c14
-rw-r--r--fs/xfs/scrub/common.h2
-rw-r--r--fs/xfs/scrub/ialloc.c64
-rw-r--r--fs/xfs/scrub/inode.c4
-rw-r--r--fs/xfs/scrub/refcount.c16
-rw-r--r--fs/xfs/scrub/repair.c54
-rw-r--r--fs/xfs/scrub/repair.h7
-rw-r--r--fs/xfs/scrub/rmap.c35
-rw-r--r--fs/xfs/scrub/scrub.h4
-rw-r--r--fs/xfs/scrub/trace.h131
-rw-r--r--fs/xfs/xfs_aops.h3
-rw-r--r--fs/xfs/xfs_bmap_util.c4
-rw-r--r--fs/xfs/xfs_buf.c1
-rw-r--r--fs/xfs/xfs_extfree_item.c5
-rw-r--r--fs/xfs/xfs_fsops.c3
-rw-r--r--fs/xfs/xfs_inode.c16
-rw-r--r--fs/xfs/xfs_ioctl32.c58
-rw-r--r--fs/xfs/xfs_itable.c14
-rw-r--r--fs/xfs/xfs_log_recover.c8
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h11
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_reflink.c232
-rw-r--r--fs/xfs/xfs_rtalloc.c57
-rw-r--r--fs/xfs/xfs_super.c10
-rw-r--r--fs/xfs/xfs_symlink.c33
-rw-r--r--fs/xfs/xfs_trace.h51
-rw-r--r--fs/xfs/xfs_trans.h7
-rw-r--r--fs/xfs/xfs_trans_bmap.c11
-rw-r--r--fs/xfs/xfs_trans_extfree.c40
-rw-r--r--fs/xfs/xfs_trans_refcount.c11
-rw-r--r--fs/xfs/xfs_trans_rmap.c11
348 files changed, 10784 insertions, 6408 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 43dea3b..8a2562e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1075,8 +1075,6 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
if (fc->ac.error < 0)
return;
- d_drop(new_dentry);
-
inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key,
newfid, newstatus, newcb, fc->cbi);
if (IS_ERR(inode)) {
@@ -1090,7 +1088,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
vnode = AFS_FS_I(inode);
set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
afs_vnode_commit_status(fc, vnode, 0);
- d_add(new_dentry, inode);
+ d_instantiate(new_dentry, inode);
}
/*
diff --git a/fs/afs/file.c b/fs/afs/file.c
index d6bc3f5..323ae99 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -17,6 +17,7 @@
#include <linux/writeback.h>
#include <linux/gfp.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/mm.h>
#include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
@@ -441,7 +442,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping,
/* Count the number of contiguous pages at the front of the list. Note
* that the list goes prev-wards rather than next-wards.
*/
- first = list_entry(pages->prev, struct page, lru);
+ first = lru_to_page(pages);
index = first->index + 1;
n = 1;
for (p = first->lru.prev; p != pages; p = p->prev) {
@@ -473,7 +474,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping,
* page at the end of the file.
*/
do {
- page = list_entry(pages->prev, struct page, lru);
+ page = lru_to_page(pages);
list_del(&page->lru);
index = page->index;
if (add_to_page_cache_lru(page, mapping, index,
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index d049cb4..3a9eaec 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -61,8 +61,11 @@ void afs_fileserver_probe_result(struct afs_call *call)
afs_io_error(call, afs_io_error_fs_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
+ case -ERFKILL:
+ case -EADDRNOTAVAIL:
case -ENETUNREACH:
case -EHOSTUNREACH:
+ case -EHOSTDOWN:
case -ECONNREFUSED:
case -ETIMEDOUT:
case -ETIME:
@@ -132,12 +135,14 @@ out:
static int afs_do_probe_fileserver(struct afs_net *net,
struct afs_server *server,
struct key *key,
- unsigned int server_index)
+ unsigned int server_index,
+ struct afs_error *_e)
{
struct afs_addr_cursor ac = {
.index = 0,
};
- int ret;
+ bool in_progress = false;
+ int err;
_enter("%pU", &server->uuid);
@@ -151,15 +156,17 @@ static int afs_do_probe_fileserver(struct afs_net *net,
server->probe.rtt = UINT_MAX;
for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
- ret = afs_fs_get_capabilities(net, server, &ac, key, server_index,
+ err = afs_fs_get_capabilities(net, server, &ac, key, server_index,
true);
- if (ret != -EINPROGRESS) {
- afs_fs_probe_done(server);
- return ret;
- }
+ if (err == -EINPROGRESS)
+ in_progress = true;
+ else
+ afs_prioritise_error(_e, err, ac.abort_code);
}
- return 0;
+ if (!in_progress)
+ afs_fs_probe_done(server);
+ return in_progress;
}
/*
@@ -169,21 +176,23 @@ int afs_probe_fileservers(struct afs_net *net, struct key *key,
struct afs_server_list *list)
{
struct afs_server *server;
- int i, ret;
+ struct afs_error e;
+ bool in_progress = false;
+ int i;
+ e.error = 0;
+ e.responded = false;
for (i = 0; i < list->nr_servers; i++) {
server = list->servers[i].server;
if (test_bit(AFS_SERVER_FL_PROBED, &server->flags))
continue;
- if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags)) {
- ret = afs_do_probe_fileserver(net, server, key, i);
- if (ret)
- return ret;
- }
+ if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags) &&
+ afs_do_probe_fileserver(net, server, key, i, &e))
+ in_progress = true;
}
- return 0;
+ return in_progress ? 0 : e.error;
}
/*
@@ -238,7 +247,7 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
}
}
- if (!still_probing || unlikely(signal_pending(current)))
+ if (!still_probing || signal_pending(current))
goto stop;
schedule();
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 4c6d8e1..6b17d36 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -382,7 +382,7 @@ void afs_zap_data(struct afs_vnode *vnode)
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
time64_t now = ktime_get_real_seconds();
- bool valid = false;
+ bool valid;
int ret;
_enter("{v={%llx:%llu} fl=%lx},%x",
@@ -402,15 +402,21 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
vnode->cb_v_break = vnode->volume->cb_v_break;
valid = false;
} else if (vnode->status.type == AFS_FTYPE_DIR &&
- test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) &&
- vnode->cb_expires_at - 10 > now) {
- valid = true;
- } else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
- vnode->cb_expires_at - 10 > now) {
+ (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) ||
+ vnode->cb_expires_at - 10 <= now)) {
+ valid = false;
+ } else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) ||
+ vnode->cb_expires_at - 10 <= now) {
+ valid = false;
+ } else {
valid = true;
}
} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
valid = true;
+ } else {
+ vnode->cb_s_break = vnode->cb_interest->server->cb_s_break;
+ vnode->cb_v_break = vnode->volume->cb_v_break;
+ valid = false;
}
read_sequnlock_excl(&vnode->cb_lock);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5da3b09..8871b9e 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -696,6 +696,14 @@ struct afs_interface {
};
/*
+ * Error prioritisation and accumulation.
+ */
+struct afs_error {
+ short error; /* Accumulated error */
+ bool responded; /* T if server responded */
+};
+
+/*
* Cursor for iterating over a server's address list.
*/
struct afs_addr_cursor {
@@ -1015,6 +1023,7 @@ static inline void __afs_stat(atomic_t *s)
* misc.c
*/
extern int afs_abort_to_error(u32);
+extern void afs_prioritise_error(struct afs_error *, int, u32);
/*
* mntpt.c
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 700a5fa..bbb1fd5 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -118,3 +118,55 @@ int afs_abort_to_error(u32 abort_code)
default: return -EREMOTEIO;
}
}
+
+/*
+ * Select the error to report from a set of errors.
+ */
+void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
+{
+ switch (error) {
+ case 0:
+ return;
+ default:
+ if (e->error == -ETIMEDOUT ||
+ e->error == -ETIME)
+ return;
+ case -ETIMEDOUT:
+ case -ETIME:
+ if (e->error == -ENOMEM ||
+ e->error == -ENONET)
+ return;
+ case -ENOMEM:
+ case -ENONET:
+ if (e->error == -ERFKILL)
+ return;
+ case -ERFKILL:
+ if (e->error == -EADDRNOTAVAIL)
+ return;
+ case -EADDRNOTAVAIL:
+ if (e->error == -ENETUNREACH)
+ return;
+ case -ENETUNREACH:
+ if (e->error == -EHOSTUNREACH)
+ return;
+ case -EHOSTUNREACH:
+ if (e->error == -EHOSTDOWN)
+ return;
+ case -EHOSTDOWN:
+ if (e->error == -ECONNREFUSED)
+ return;
+ case -ECONNREFUSED:
+ if (e->error == -ECONNRESET)
+ return;
+ case -ECONNRESET: /* Responded, but call expired. */
+ if (e->responded)
+ return;
+ e->error = error;
+ return;
+
+ case -ECONNABORTED:
+ e->responded = true;
+ e->error = afs_abort_to_error(abort_code);
+ return;
+ }
+}
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 0050425..c3ae324 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -136,7 +136,8 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
struct afs_addr_list *alist;
struct afs_server *server;
struct afs_vnode *vnode = fc->vnode;
- u32 rtt, abort_code;
+ struct afs_error e;
+ u32 rtt;
int error = fc->ac.error, i;
_enter("%lx[%d],%lx[%d],%d,%d",
@@ -306,8 +307,11 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
if (fc->error != -EDESTADDRREQ)
goto iterate_address;
/* Fall through */
+ case -ERFKILL:
+ case -EADDRNOTAVAIL:
case -ENETUNREACH:
case -EHOSTUNREACH:
+ case -EHOSTDOWN:
case -ECONNREFUSED:
_debug("no conn");
fc->error = error;
@@ -446,50 +450,15 @@ no_more_servers:
if (fc->flags & AFS_FS_CURSOR_VBUSY)
goto restart_from_beginning;
- abort_code = 0;
- error = -EDESTADDRREQ;
+ e.error = -EDESTADDRREQ;
+ e.responded = false;
for (i = 0; i < fc->server_list->nr_servers; i++) {
struct afs_server *s = fc->server_list->servers[i].server;
- int probe_error = READ_ONCE(s->probe.error);
- switch (probe_error) {
- case 0:
- continue;
- default:
- if (error == -ETIMEDOUT ||
- error == -ETIME)
- continue;
- case -ETIMEDOUT:
- case -ETIME:
- if (error == -ENOMEM ||
- error == -ENONET)
- continue;
- case -ENOMEM:
- case -ENONET:
- if (error == -ENETUNREACH)
- continue;
- case -ENETUNREACH:
- if (error == -EHOSTUNREACH)
- continue;
- case -EHOSTUNREACH:
- if (error == -ECONNREFUSED)
- continue;
- case -ECONNREFUSED:
- if (error == -ECONNRESET)
- continue;
- case -ECONNRESET: /* Responded, but call expired. */
- if (error == -ECONNABORTED)
- continue;
- case -ECONNABORTED:
- abort_code = s->probe.abort_code;
- error = probe_error;
- continue;
- }
+ afs_prioritise_error(&e, READ_ONCE(s->probe.error),
+ s->probe.abort_code);
}
- if (error == -ECONNABORTED)
- error = afs_abort_to_error(abort_code);
-
failed_set_error:
fc->error = error;
failed:
@@ -553,8 +522,11 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
_leave(" = f [abort]");
return false;
+ case -ERFKILL:
+ case -EADDRNOTAVAIL:
case -ENETUNREACH:
case -EHOSTUNREACH:
+ case -EHOSTDOWN:
case -ECONNREFUSED:
case -ETIMEDOUT:
case -ETIME:
@@ -633,6 +605,7 @@ int afs_end_vnode_operation(struct afs_fs_cursor *fc)
struct afs_net *net = afs_v2net(fc->vnode);
if (fc->error == -EDESTADDRREQ ||
+ fc->error == -EADDRNOTAVAIL ||
fc->error == -ENETUNREACH ||
fc->error == -EHOSTUNREACH)
afs_dump_edestaddrreq(fc);
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index c0f616b..f402ee8 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -61,8 +61,11 @@ void afs_vlserver_probe_result(struct afs_call *call)
afs_io_error(call, afs_io_error_vl_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
+ case -ERFKILL:
+ case -EADDRNOTAVAIL:
case -ENETUNREACH:
case -EHOSTUNREACH:
+ case -EHOSTDOWN:
case -ECONNREFUSED:
case -ETIMEDOUT:
case -ETIME:
@@ -129,15 +132,17 @@ out:
* Probe all of a vlserver's addresses to find out the best route and to
* query its capabilities.
*/
-static int afs_do_probe_vlserver(struct afs_net *net,
- struct afs_vlserver *server,
- struct key *key,
- unsigned int server_index)
+static bool afs_do_probe_vlserver(struct afs_net *net,
+ struct afs_vlserver *server,
+ struct key *key,
+ unsigned int server_index,
+ struct afs_error *_e)
{
struct afs_addr_cursor ac = {
.index = 0,
};
- int ret;
+ bool in_progress = false;
+ int err;
_enter("%s", server->name);
@@ -151,15 +156,17 @@ static int afs_do_probe_vlserver(struct afs_net *net,
server->probe.rtt = UINT_MAX;
for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
- ret = afs_vl_get_capabilities(net, &ac, key, server,
+ err = afs_vl_get_capabilities(net, &ac, key, server,
server_index, true);
- if (ret != -EINPROGRESS) {
- afs_vl_probe_done(server);
- return ret;
- }
+ if (err == -EINPROGRESS)
+ in_progress = true;
+ else
+ afs_prioritise_error(_e, err, ac.abort_code);
}
- return 0;
+ if (!in_progress)
+ afs_vl_probe_done(server);
+ return in_progress;
}
/*
@@ -169,21 +176,23 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key,
struct afs_vlserver_list *vllist)
{
struct afs_vlserver *server;
- int i, ret;
+ struct afs_error e;
+ bool in_progress = false;
+ int i;
+ e.error = 0;
+ e.responded = false;
for (i = 0; i < vllist->nr_servers; i++) {
server = vllist->servers[i].server;
if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags))
continue;
- if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags)) {
- ret = afs_do_probe_vlserver(net, server, key, i);
- if (ret)
- return ret;
- }
+ if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags) &&
+ afs_do_probe_vlserver(net, server, key, i, &e))
+ in_progress = true;
}
- return 0;
+ return in_progress ? 0 : e.error;
}
/*
@@ -239,7 +248,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
}
}
- if (!still_probing || unlikely(signal_pending(current)))
+ if (!still_probing || signal_pending(current))
goto stop;
schedule();
}
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index b64a284..7adde83 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -71,8 +71,9 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
{
struct afs_addr_list *alist;
struct afs_vlserver *vlserver;
+ struct afs_error e;
u32 rtt;
- int error = vc->ac.error, abort_code, i;
+ int error = vc->ac.error, i;
_enter("%lx[%d],%lx[%d],%d,%d",
vc->untried, vc->index,
@@ -119,8 +120,11 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
goto failed;
}
+ case -ERFKILL:
+ case -EADDRNOTAVAIL:
case -ENETUNREACH:
case -EHOSTUNREACH:
+ case -EHOSTDOWN:
case -ECONNREFUSED:
case -ETIMEDOUT:
case -ETIME:
@@ -235,50 +239,15 @@ no_more_servers:
if (vc->flags & AFS_VL_CURSOR_RETRY)
goto restart_from_beginning;
- abort_code = 0;
- error = -EDESTADDRREQ;
+ e.error = -EDESTADDRREQ;
+ e.responded = false;
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
- int probe_error = READ_ONCE(s->probe.error);
- switch (probe_error) {
- case 0:
- continue;
- default:
- if (error == -ETIMEDOUT ||
- error == -ETIME)
- continue;
- case -ETIMEDOUT:
- case -ETIME:
- if (error == -ENOMEM ||
- error == -ENONET)
- continue;
- case -ENOMEM:
- case -ENONET:
- if (error == -ENETUNREACH)
- continue;
- case -ENETUNREACH:
- if (error == -EHOSTUNREACH)
- continue;
- case -EHOSTUNREACH:
- if (error == -ECONNREFUSED)
- continue;
- case -ECONNREFUSED:
- if (error == -ECONNRESET)
- continue;
- case -ECONNRESET: /* Responded, but call expired. */
- if (error == -ECONNABORTED)
- continue;
- case -ECONNABORTED:
- abort_code = s->probe.abort_code;
- error = probe_error;
- continue;
- }
+ afs_prioritise_error(&e, READ_ONCE(s->probe.error),
+ s->probe.abort_code);
}
- if (error == -ECONNABORTED)
- error = afs_abort_to_error(abort_code);
-
failed_set_error:
vc->error = error;
failed:
@@ -341,6 +310,7 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc)
struct afs_net *net = vc->cell->net;
if (vc->error == -EDESTADDRREQ ||
+ vc->error == -EADDRNOTAVAIL ||
vc->error == -ENETUNREACH ||
vc->error == -EHOSTUNREACH)
afs_vl_dump_edestaddrreq(vc);
diff --git a/fs/aio.c b/fs/aio.c
index 301e631..b906ff7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -45,6 +45,7 @@
#include <asm/kmap_types.h>
#include <linux/uaccess.h>
+#include <linux/nospec.h>
#include "internal.h"
@@ -69,6 +70,12 @@ struct aio_ring {
struct io_event io_events[0];
}; /* 128 bytes + ring size */
+/*
+ * Plugging is meant to work with larger batches of IOs. If we don't
+ * have more than the below, then don't bother setting up a plug.
+ */
+#define AIO_PLUG_THRESHOLD 2
+
#define AIO_RING_PAGES 8
struct kioctx_table {
@@ -408,7 +415,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
BUG_ON(PageWriteback(old));
get_page(new);
- rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
+ rc = migrate_page_move_mapping(mapping, new, old, mode, 1);
if (rc != MIGRATEPAGE_SUCCESS) {
put_page(new);
goto out_unlock;
@@ -901,7 +908,7 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
local_irq_restore(flags);
}
-static bool get_reqs_available(struct kioctx *ctx)
+static bool __get_reqs_available(struct kioctx *ctx)
{
struct kioctx_cpu *kcpu;
bool ret = false;
@@ -993,6 +1000,14 @@ static void user_refill_reqs_available(struct kioctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
}
+static bool get_reqs_available(struct kioctx *ctx)
+{
+ if (__get_reqs_available(ctx))
+ return true;
+ user_refill_reqs_available(ctx);
+ return __get_reqs_available(ctx);
+}
+
/* aio_get_req
* Allocate a slot for an aio request.
* Returns NULL if no requests are free.
@@ -1001,24 +1016,16 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
{
struct aio_kiocb *req;
- if (!get_reqs_available(ctx)) {
- user_refill_reqs_available(ctx);
- if (!get_reqs_available(ctx))
- return NULL;
- }
-
- req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+ req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
if (unlikely(!req))
- goto out_put;
+ return NULL;
percpu_ref_get(&ctx->reqs);
+ req->ki_ctx = ctx;
INIT_LIST_HEAD(&req->ki_list);
refcount_set(&req->ki_refcnt, 0);
- req->ki_ctx = ctx;
+ req->ki_eventfd = NULL;
return req;
-out_put:
- put_reqs_available(ctx, 1);
- return NULL;
}
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
@@ -1038,6 +1045,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
if (!table || id >= table->nr)
goto out;
+ id = array_index_nospec(id, table->nr);
ctx = rcu_dereference(table->table[id]);
if (ctx && ctx->user_id == ctx_id) {
if (percpu_ref_tryget_live(&ctx->users))
@@ -1057,6 +1065,15 @@ static inline void iocb_put(struct aio_kiocb *iocb)
}
}
+static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb,
+ long res, long res2)
+{
+ ev->obj = (u64)(unsigned long)iocb->ki_user_iocb;
+ ev->data = iocb->ki_user_data;
+ ev->res = res;
+ ev->res2 = res2;
+}
+
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
@@ -1084,10 +1101,7 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
event = ev_page + pos % AIO_EVENTS_PER_PAGE;
- event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
- event->data = iocb->ki_user_data;
- event->res = res;
- event->res2 = res2;
+ aio_fill_event(event, iocb, res, res2);
kunmap_atomic(ev_page);
flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
@@ -1414,7 +1428,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
aio_complete(iocb, res, res2);
}
-static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
+static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
{
int ret;
@@ -1436,20 +1450,26 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
ret = ioprio_check_cap(iocb->aio_reqprio);
if (ret) {
pr_debug("aio ioprio check cap error: %d\n", ret);
- return ret;
+ goto out_fput;
}
req->ki_ioprio = iocb->aio_reqprio;
} else
- req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+ req->ki_ioprio = get_current_ioprio();
ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
if (unlikely(ret))
- fput(req->ki_filp);
+ goto out_fput;
+
+ req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
+ return 0;
+
+out_fput:
+ fput(req->ki_filp);
return ret;
}
-static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
+static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
bool vectored, bool compat, struct iov_iter *iter)
{
void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
@@ -1484,12 +1504,12 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
ret = -EINTR;
/*FALLTHRU*/
default:
- aio_complete_rw(req, ret, 0);
+ req->ki_complete(req, ret, 0);
}
}
-static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
- bool compat)
+static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
+ bool vectored, bool compat)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
@@ -1521,8 +1541,8 @@ out_fput:
return ret;
}
-static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
- bool compat)
+static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
+ bool vectored, bool compat)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
@@ -1577,7 +1597,8 @@ static void aio_fsync_work(struct work_struct *work)
aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
}
-static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
+static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
+ bool datasync)
{
if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
iocb->aio_rw_flags))
@@ -1705,7 +1726,7 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
add_wait_queue(head, &pt->iocb->poll.wait);
}
-static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
{
struct kioctx *ctx = aiocb->ki_ctx;
struct poll_iocb *req = &aiocb->poll;
@@ -1725,6 +1746,10 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
if (unlikely(!req->file))
return -EBADF;
+ req->head = NULL;
+ req->woken = false;
+ req->cancelled = false;
+
apt.pt._qproc = aio_poll_queue_proc;
apt.pt._key = req->events;
apt.iocb = aiocb;
@@ -1773,44 +1798,44 @@ out:
return 0;
}
-static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- bool compat)
+static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
+ struct iocb __user *user_iocb, bool compat)
{
struct aio_kiocb *req;
- struct iocb iocb;
ssize_t ret;
- if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
- return -EFAULT;
-
/* enforce forwards compatibility on users */
- if (unlikely(iocb.aio_reserved2)) {
+ if (unlikely(iocb->aio_reserved2)) {
pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
/* prevent overflows */
if (unlikely(
- (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
- (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
- ((ssize_t)iocb.aio_nbytes < 0)
+ (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
+ (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
+ ((ssize_t)iocb->aio_nbytes < 0)
)) {
pr_debug("EINVAL: overflow check\n");
return -EINVAL;
}
+ if (!get_reqs_available(ctx))
+ return -EAGAIN;
+
+ ret = -EAGAIN;
req = aio_get_req(ctx);
if (unlikely(!req))
- return -EAGAIN;
+ goto out_put_reqs_available;
- if (iocb.aio_flags & IOCB_FLAG_RESFD) {
+ if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
* instance of the file* now. The file descriptor must be
* an eventfd() fd, and will be signaled for each completed
* event using the eventfd_signal() function.
*/
- req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
+ req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
if (IS_ERR(req->ki_eventfd)) {
ret = PTR_ERR(req->ki_eventfd);
req->ki_eventfd = NULL;
@@ -1825,32 +1850,32 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
}
req->ki_user_iocb = user_iocb;
- req->ki_user_data = iocb.aio_data;
+ req->ki_user_data = iocb->aio_data;
- switch (iocb.aio_lio_opcode) {
+ switch (iocb->aio_lio_opcode) {
case IOCB_CMD_PREAD:
- ret = aio_read(&req->rw, &iocb, false, compat);
+ ret = aio_read(&req->rw, iocb, false, compat);
break;
case IOCB_CMD_PWRITE:
- ret = aio_write(&req->rw, &iocb, false, compat);
+ ret = aio_write(&req->rw, iocb, false, compat);
break;
case IOCB_CMD_PREADV:
- ret = aio_read(&req->rw, &iocb, true, compat);
+ ret = aio_read(&req->rw, iocb, true, compat);
break;
case IOCB_CMD_PWRITEV:
- ret = aio_write(&req->rw, &iocb, true, compat);
+ ret = aio_write(&req->rw, iocb, true, compat);
break;
case IOCB_CMD_FSYNC:
- ret = aio_fsync(&req->fsync, &iocb, false);
+ ret = aio_fsync(&req->fsync, iocb, false);
break;
case IOCB_CMD_FDSYNC:
- ret = aio_fsync(&req->fsync, &iocb, true);
+ ret = aio_fsync(&req->fsync, iocb, true);
break;
case IOCB_CMD_POLL:
- ret = aio_poll(req, &iocb);
+ ret = aio_poll(req, iocb);
break;
default:
- pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
+ pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
ret = -EINVAL;
break;
}
@@ -1864,14 +1889,25 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
goto out_put_req;
return 0;
out_put_req:
- put_reqs_available(ctx, 1);
- percpu_ref_put(&ctx->reqs);
if (req->ki_eventfd)
eventfd_ctx_put(req->ki_eventfd);
- kmem_cache_free(kiocb_cachep, req);
+ iocb_put(req);
+out_put_reqs_available:
+ put_reqs_available(ctx, 1);
return ret;
}
+static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
+ bool compat)
+{
+ struct iocb iocb;
+
+ if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
+ return -EFAULT;
+
+ return __io_submit_one(ctx, &iocb, user_iocb, compat);
+}
+
/* sys_io_submit:
* Queue the nr iocbs pointed to by iocbpp for processing. Returns
* the number of iocbs queued. May return -EINVAL if the aio_context
@@ -1904,7 +1940,8 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
if (nr > ctx->nr_events)
nr = ctx->nr_events;
- blk_start_plug(&plug);
+ if (nr > AIO_PLUG_THRESHOLD)
+ blk_start_plug(&plug);
for (i = 0; i < nr; i++) {
struct iocb __user *user_iocb;
@@ -1917,7 +1954,8 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
if (ret)
break;
}
- blk_finish_plug(&plug);
+ if (nr > AIO_PLUG_THRESHOLD)
+ blk_finish_plug(&plug);
percpu_ref_put(&ctx->users);
return i ? i : ret;
@@ -1944,7 +1982,8 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
if (nr > ctx->nr_events)
nr = ctx->nr_events;
- blk_start_plug(&plug);
+ if (nr > AIO_PLUG_THRESHOLD)
+ blk_start_plug(&plug);
for (i = 0; i < nr; i++) {
compat_uptr_t user_iocb;
@@ -1957,7 +1996,8 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
if (ret)
break;
}
- blk_finish_plug(&plug);
+ if (nr > AIO_PLUG_THRESHOLD)
+ blk_finish_plug(&plug);
percpu_ref_put(&ctx->users);
return i ? i : ret;
@@ -2062,11 +2102,13 @@ static long do_io_getevents(aio_context_t ctx_id,
* specifies an infinite timeout. Note that the timeout pointed to by
* timeout is relative. Will fail with -ENOSYS if not implemented.
*/
+#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+
SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
long, min_nr,
long, nr,
struct io_event __user *, events,
- struct timespec __user *, timeout)
+ struct __kernel_timespec __user *, timeout)
{
struct timespec64 ts;
int ret;
@@ -2080,6 +2122,8 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
return ret;
}
+#endif
+
struct __aio_sigset {
const sigset_t __user *sigmask;
size_t sigsetsize;
@@ -2090,7 +2134,7 @@ SYSCALL_DEFINE6(io_pgetevents,
long, min_nr,
long, nr,
struct io_event __user *, events,
- struct timespec __user *, timeout,
+ struct __kernel_timespec __user *, timeout,
const struct __aio_sigset __user *, usig)
{
struct __aio_sigset ksig = { NULL, };
@@ -2104,33 +2148,56 @@ SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- if (ksig.sigmask) {
- if (ksig.sigsetsize != sizeof(sigset_t))
- return -EINVAL;
- if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask)))
- return -EFAULT;
- sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
+ ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ if (ret)
+ return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
- if (signal_pending(current)) {
- if (ksig.sigmask) {
- current->saved_sigmask = sigsaved;
- set_restore_sigmask();
- }
+ restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_pending(current) && !ret)
+ ret = -ERESTARTNOHAND;
- if (!ret)
- ret = -ERESTARTNOHAND;
- } else {
- if (ksig.sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- }
+ return ret;
+}
+
+#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)
+
+SYSCALL_DEFINE6(io_pgetevents_time32,
+ aio_context_t, ctx_id,
+ long, min_nr,
+ long, nr,
+ struct io_event __user *, events,
+ struct old_timespec32 __user *, timeout,
+ const struct __aio_sigset __user *, usig)
+{
+ struct __aio_sigset ksig = { NULL, };
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 ts;
+ int ret;
+
+ if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
+ return -EFAULT;
+
+ if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ return -EFAULT;
+
+
+ ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ if (ret)
+ return ret;
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+ restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_pending(current) && !ret)
+ ret = -ERESTARTNOHAND;
return ret;
}
-#ifdef CONFIG_COMPAT
+#endif
+
+#if defined(CONFIG_COMPAT_32BIT_TIME)
+
COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
compat_long_t, min_nr,
compat_long_t, nr,
@@ -2149,12 +2216,17 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
return ret;
}
+#endif
+
+#ifdef CONFIG_COMPAT
struct __compat_aio_sigset {
compat_sigset_t __user *sigmask;
compat_size_t sigsetsize;
};
+#if defined(CONFIG_COMPAT_32BIT_TIME)
+
COMPAT_SYSCALL_DEFINE6(io_pgetevents,
compat_aio_context_t, ctx_id,
compat_long_t, min_nr,
@@ -2174,27 +2246,47 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- if (ksig.sigmask) {
- if (ksig.sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (get_compat_sigset(&ksigmask, ksig.sigmask))
- return -EFAULT;
- sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
+ ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ if (ret)
+ return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
- if (signal_pending(current)) {
- if (ksig.sigmask) {
- current->saved_sigmask = sigsaved;
- set_restore_sigmask();
- }
- if (!ret)
- ret = -ERESTARTNOHAND;
- } else {
- if (ksig.sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- }
+ restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_pending(current) && !ret)
+ ret = -ERESTARTNOHAND;
+
+ return ret;
+}
+
+#endif
+
+COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
+ compat_aio_context_t, ctx_id,
+ compat_long_t, min_nr,
+ compat_long_t, nr,
+ struct io_event __user *, events,
+ struct __kernel_timespec __user *, timeout,
+ const struct __compat_aio_sigset __user *, usig)
+{
+ struct __compat_aio_sigset ksig = { NULL, };
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 t;
+ int ret;
+
+ if (timeout && get_timespec64(&t, timeout))
+ return -EFAULT;
+
+ if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ return -EFAULT;
+
+ ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ if (ret)
+ return ret;
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+ restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_pending(current) && !ret)
+ ret = -ERESTARTNOHAND;
return ret;
}
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 9f9cadb..3e59f0e 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -42,6 +42,8 @@
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__
+extern struct file_system_type autofs_fs_type;
+
/*
* Unified info structure. This is pointed to by both the dentry and
* inode structures. Each file in the filesystem has an instance of this
@@ -101,16 +103,19 @@ struct autofs_wait_queue {
#define AUTOFS_SBI_MAGIC 0x6d4a556d
+#define AUTOFS_SBI_CATATONIC 0x0001
+#define AUTOFS_SBI_STRICTEXPIRE 0x0002
+
struct autofs_sb_info {
u32 magic;
int pipefd;
struct file *pipe;
struct pid *oz_pgrp;
- int catatonic;
int version;
int sub_version;
int min_proto;
int max_proto;
+ unsigned int flags;
unsigned long exp_timeout;
unsigned int type;
struct super_block *sb;
@@ -126,8 +131,7 @@ struct autofs_sb_info {
static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
{
- return sb->s_magic != AUTOFS_SUPER_MAGIC ?
- NULL : (struct autofs_sb_info *)(sb->s_fs_info);
+ return (struct autofs_sb_info *)(sb->s_fs_info);
}
static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry)
@@ -141,7 +145,8 @@ static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry)
*/
static inline int autofs_oz_mode(struct autofs_sb_info *sbi)
{
- return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
+ return ((sbi->flags & AUTOFS_SBI_CATATONIC) ||
+ task_pgrp(current) == sbi->oz_pgrp);
}
struct inode *autofs_get_inode(struct super_block *, umode_t);
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 86eafda..e9fe74d 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -151,22 +151,6 @@ out:
return err;
}
-/*
- * Get the autofs super block info struct from the file opened on
- * the autofs mount point.
- */
-static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
-{
- struct autofs_sb_info *sbi = NULL;
- struct inode *inode;
-
- if (f) {
- inode = file_inode(f);
- sbi = autofs_sbi(inode->i_sb);
- }
- return sbi;
-}
-
/* Return autofs dev ioctl version */
static int autofs_dev_ioctl_version(struct file *fp,
struct autofs_sb_info *sbi,
@@ -366,7 +350,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
pipefd = param->setpipefd.pipefd;
mutex_lock(&sbi->wq_mutex);
- if (!sbi->catatonic) {
+ if (!(sbi->flags & AUTOFS_SBI_CATATONIC)) {
mutex_unlock(&sbi->wq_mutex);
return -EBUSY;
} else {
@@ -393,7 +377,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
swap(sbi->oz_pgrp, new_pid);
sbi->pipefd = pipefd;
sbi->pipe = pipe;
- sbi->catatonic = 0;
+ sbi->flags &= ~AUTOFS_SBI_CATATONIC;
}
out:
put_pid(new_pid);
@@ -658,6 +642,8 @@ static int _autofs_dev_ioctl(unsigned int command,
if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD &&
cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) {
+ struct super_block *sb;
+
fp = fget(param->ioctlfd);
if (!fp) {
if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD)
@@ -666,12 +652,13 @@ static int _autofs_dev_ioctl(unsigned int command,
goto out;
}
- sbi = autofs_dev_ioctl_sbi(fp);
- if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) {
+ sb = file_inode(fp)->i_sb;
+ if (sb->s_type != &autofs_fs_type) {
err = -EINVAL;
fput(fp);
goto out;
}
+ sbi = autofs_sbi(sb);
/*
* Admin needs to be able to set the mount catatonic in
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index 79ae07d..c0c1db2 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -16,7 +16,7 @@ static struct dentry *autofs_mount(struct file_system_type *fs_type,
return mount_nodev(fs_type, flags, data, autofs_fill_super);
}
-static struct file_system_type autofs_fs_type = {
+struct file_system_type autofs_fs_type = {
.owner = THIS_MODULE,
.name = "autofs",
.mount = autofs_mount,
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 846c052..0e8ea2d 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -87,6 +87,8 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",direct");
else
seq_printf(m, ",indirect");
+ if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE)
+ seq_printf(m, ",strictexpire");
#ifdef CONFIG_CHECKPOINT_RESTORE
if (sbi->pipe)
seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
@@ -109,7 +111,7 @@ static const struct super_operations autofs_sops = {
};
enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
- Opt_indirect, Opt_direct, Opt_offset};
+ Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire};
static const match_table_t tokens = {
{Opt_fd, "fd=%u"},
@@ -121,24 +123,28 @@ static const match_table_t tokens = {
{Opt_indirect, "indirect"},
{Opt_direct, "direct"},
{Opt_offset, "offset"},
+ {Opt_strictexpire, "strictexpire"},
{Opt_err, NULL}
};
-static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
- int *pgrp, bool *pgrp_set, unsigned int *type,
- int *minproto, int *maxproto)
+static int parse_options(char *options,
+ struct inode *root, int *pgrp, bool *pgrp_set,
+ struct autofs_sb_info *sbi)
{
char *p;
substring_t args[MAX_OPT_ARGS];
int option;
+ int pipefd = -1;
+ kuid_t uid;
+ kgid_t gid;
- *uid = current_uid();
- *gid = current_gid();
+ root->i_uid = current_uid();
+ root->i_gid = current_gid();
- *minproto = AUTOFS_MIN_PROTO_VERSION;
- *maxproto = AUTOFS_MAX_PROTO_VERSION;
+ sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
+ sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
- *pipefd = -1;
+ sbi->pipefd = -1;
if (!options)
return 1;
@@ -152,22 +158,25 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
token = match_token(p, tokens, args);
switch (token) {
case Opt_fd:
- if (match_int(args, pipefd))
+ if (match_int(args, &pipefd))
return 1;
+ sbi->pipefd = pipefd;
break;
case Opt_uid:
if (match_int(args, &option))
return 1;
- *uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(*uid))
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid))
return 1;
+ root->i_uid = uid;
break;
case Opt_gid:
if (match_int(args, &option))
return 1;
- *gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(*gid))
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid))
return 1;
+ root->i_gid = gid;
break;
case Opt_pgrp:
if (match_int(args, &option))
@@ -178,27 +187,30 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
case Opt_minproto:
if (match_int(args, &option))
return 1;
- *minproto = option;
+ sbi->min_proto = option;
break;
case Opt_maxproto:
if (match_int(args, &option))
return 1;
- *maxproto = option;
+ sbi->max_proto = option;
break;
case Opt_indirect:
- set_autofs_type_indirect(type);
+ set_autofs_type_indirect(&sbi->type);
break;
case Opt_direct:
- set_autofs_type_direct(type);
+ set_autofs_type_direct(&sbi->type);
break;
case Opt_offset:
- set_autofs_type_offset(type);
+ set_autofs_type_offset(&sbi->type);
+ break;
+ case Opt_strictexpire:
+ sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
break;
default:
return 1;
}
}
- return (*pipefd < 0);
+ return (sbi->pipefd < 0);
}
int autofs_fill_super(struct super_block *s, void *data, int silent)
@@ -206,7 +218,6 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
struct inode *root_inode;
struct dentry *root;
struct file *pipe;
- int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
int pgrp = 0;
@@ -222,12 +233,12 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
sbi->magic = AUTOFS_SBI_MAGIC;
sbi->pipefd = -1;
sbi->pipe = NULL;
- sbi->catatonic = 1;
sbi->exp_timeout = 0;
sbi->oz_pgrp = NULL;
sbi->sb = s;
sbi->version = 0;
sbi->sub_version = 0;
+ sbi->flags = AUTOFS_SBI_CATATONIC;
set_autofs_type_indirect(&sbi->type);
sbi->min_proto = 0;
sbi->max_proto = 0;
@@ -262,9 +273,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
root->d_fsdata = ino;
/* Can this call block? */
- if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
- &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
- &sbi->max_proto)) {
+ if (parse_options(data, root_inode, &pgrp, &pgrp_set, sbi)) {
pr_err("called with bogus options\n");
goto fail_dput;
}
@@ -303,8 +312,9 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
root_inode->i_fop = &autofs_root_operations;
root_inode->i_op = &autofs_dir_inode_operations;
- pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
- pipe = fget(pipefd);
+ pr_debug("pipe fd = %d, pgrp = %u\n",
+ sbi->pipefd, pid_nr(sbi->oz_pgrp));
+ pipe = fget(sbi->pipefd);
if (!pipe) {
pr_err("could not open pipe file descriptor\n");
@@ -314,8 +324,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
if (ret < 0)
goto fail_fput;
sbi->pipe = pipe;
- sbi->pipefd = pipefd;
- sbi->catatonic = 0;
+ sbi->flags &= ~AUTOFS_SBI_CATATONIC;
/*
* Success! Install the root dentry now to indicate completion.
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 782e57b..1246f39 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -275,8 +275,11 @@ static int autofs_mount_wait(const struct path *path, bool rcu_walk)
pr_debug("waiting for mount name=%pd\n", path->dentry);
status = autofs_wait(sbi, path, NFY_MOUNT);
pr_debug("mount wait done status=%d\n", status);
+ ino->last_used = jiffies;
+ return status;
}
- ino->last_used = jiffies;
+ if (!(sbi->flags & AUTOFS_SBI_STRICTEXPIRE))
+ ino->last_used = jiffies;
return status;
}
@@ -510,7 +513,8 @@ static struct dentry *autofs_lookup(struct inode *dir,
sbi = autofs_sbi(dir->i_sb);
pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
- current->pid, task_pgrp_nr(current), sbi->catatonic,
+ current->pid, task_pgrp_nr(current),
+ sbi->flags & AUTOFS_SBI_CATATONIC,
autofs_oz_mode(sbi));
active = autofs_lookup_active(dentry);
@@ -563,7 +567,7 @@ static int autofs_dir_symlink(struct inode *dir,
* autofs mount is catatonic but the state of an autofs
* file system needs to be preserved over restarts.
*/
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -EACCES;
BUG_ON(!ino);
@@ -626,7 +630,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
* autofs mount is catatonic but the state of an autofs
* file system needs to be preserved over restarts.
*/
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -EACCES;
if (atomic_dec_and_test(&ino->count)) {
@@ -714,7 +718,7 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
* autofs mount is catatonic but the state of an autofs
* file system needs to be preserved over restarts.
*/
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -EACCES;
spin_lock(&sbi->lookup_lock);
@@ -759,7 +763,7 @@ static int autofs_dir_mkdir(struct inode *dir,
* autofs mount is catatonic but the state of an autofs
* file system needs to be preserved over restarts.
*/
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -EACCES;
pr_debug("dentry %p, creating %pd\n", dentry, dentry);
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index f6385c6..15a3e31 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -20,14 +20,14 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi)
struct autofs_wait_queue *wq, *nwq;
mutex_lock(&sbi->wq_mutex);
- if (sbi->catatonic) {
+ if (sbi->flags & AUTOFS_SBI_CATATONIC) {
mutex_unlock(&sbi->wq_mutex);
return;
}
pr_debug("entering catatonic mode\n");
- sbi->catatonic = 1;
+ sbi->flags |= AUTOFS_SBI_CATATONIC;
wq = sbi->queues;
sbi->queues = NULL; /* Erase all wait queues */
while (wq) {
@@ -255,7 +255,7 @@ static int validate_request(struct autofs_wait_queue **wait,
struct autofs_wait_queue *wq;
struct autofs_info *ino;
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -ENOENT;
/* Wait in progress, continue; */
@@ -290,7 +290,7 @@ static int validate_request(struct autofs_wait_queue **wait,
if (mutex_lock_interruptible(&sbi->wq_mutex))
return -EINTR;
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -ENOENT;
wq = autofs_find_wait(sbi, qstr);
@@ -359,7 +359,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
pid_t tgid;
/* In catatonic mode, we don't wait for nobody */
- if (sbi->catatonic)
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
return -ENOENT;
/*
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 67aef3b..606f937 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -1,13 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/bfs/bfs.h
- * Copyright (C) 1999 Tigran Aivazian <tigran@veritas.com>
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
*/
#ifndef _FS_BFS_BFS_H
#define _FS_BFS_BFS_H
#include <linux/bfs_fs.h>
+/* In theory BFS supports up to 512 inodes, numbered from 2 (for /) up to 513 inclusive.
+ In actual fact, attempting to create the 512th inode (i.e. inode No. 513 or file No. 511)
+ will fail with ENOSPC in bfs_add_entry(): the root directory cannot contain so many entries, counting '..'.
+ So, mkfs.bfs(8) should really limit its -N option to 511 and not 512. For now, we just print a warning
+ if a filesystem is mounted with such "impossible to fill up" number of inodes */
+#define BFS_MAX_LASTI 513
+
/*
* BFS file system in-core superblock info
*/
@@ -17,7 +24,7 @@ struct bfs_sb_info {
unsigned long si_freei;
unsigned long si_lf_eblk;
unsigned long si_lasti;
- unsigned long *si_imap;
+ DECLARE_BITMAP(si_imap, BFS_MAX_LASTI+1);
struct mutex bfs_lock;
};
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index f32f21c..d8dfe3a 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -2,8 +2,8 @@
/*
* fs/bfs/dir.c
* BFS directory operations.
- * Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
- * Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
*/
#include <linux/time.h>
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 1476cdd..0dceefc 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -2,7 +2,7 @@
/*
* fs/bfs/file.c
* BFS file operations.
- * Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
*
* Make the file block allocation algorithm understand the size
* of the underlying block device.
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index d81c148..d136b2a 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -1,10 +1,9 @@
/*
* fs/bfs/inode.c
* BFS superblock and inode operations.
- * Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
* From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
- *
- * Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
+ * Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
*/
#include <linux/module.h>
@@ -118,12 +117,12 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct bfs_sb_info *info = BFS_SB(inode->i_sb);
unsigned int ino = (u16)inode->i_ino;
- unsigned long i_sblock;
+ unsigned long i_sblock;
struct bfs_inode *di;
struct buffer_head *bh;
int err = 0;
- dprintf("ino=%08x\n", ino);
+ dprintf("ino=%08x\n", ino);
di = find_inode(inode->i_sb, ino, &bh);
if (IS_ERR(di))
@@ -144,7 +143,7 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
- i_sblock = BFS_I(inode)->i_sblock;
+ i_sblock = BFS_I(inode)->i_sblock;
di->i_sblock = cpu_to_le32(i_sblock);
di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
@@ -188,13 +187,13 @@ static void bfs_evict_inode(struct inode *inode)
mark_buffer_dirty(bh);
brelse(bh);
- if (bi->i_dsk_ino) {
+ if (bi->i_dsk_ino) {
if (bi->i_sblock)
info->si_freeb += bi->i_eblock + 1 - bi->i_sblock;
info->si_freei++;
clear_bit(ino, info->si_imap);
- bfs_dump_imap("delete_inode", s);
- }
+ bfs_dump_imap("evict_inode", s);
+ }
/*
* If this was the last file, make the previous block
@@ -214,7 +213,6 @@ static void bfs_put_super(struct super_block *s)
return;
mutex_destroy(&info->bfs_lock);
- kfree(info->si_imap);
kfree(info);
s->s_fs_info = NULL;
}
@@ -311,8 +309,7 @@ void bfs_dump_imap(const char *prefix, struct super_block *s)
else
strcat(tmpbuf, "0");
}
- printf("BFS-fs: %s: lasti=%08lx <%s>\n",
- prefix, BFS_SB(s)->si_lasti, tmpbuf);
+ printf("%s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf);
free_page((unsigned long)tmpbuf);
#endif
}
@@ -322,7 +319,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
struct buffer_head *bh, *sbh;
struct bfs_super_block *bfs_sb;
struct inode *inode;
- unsigned i, imap_len;
+ unsigned i;
struct bfs_sb_info *info;
int ret = -EINVAL;
unsigned long i_sblock, i_eblock, i_eoff, s_size;
@@ -341,8 +338,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
bfs_sb = (struct bfs_super_block *)sbh->b_data;
if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
if (!silent)
- printf("No BFS filesystem on %s (magic=%08x)\n",
- s->s_id, le32_to_cpu(bfs_sb->s_magic));
+ printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic));
goto out1;
}
if (BFS_UNCLEAN(bfs_sb, s) && !silent)
@@ -351,18 +347,16 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
s->s_magic = BFS_MAGIC;
if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) ||
- le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) {
- printf("Superblock is corrupted\n");
+ le32_to_cpu(bfs_sb->s_start) < sizeof(struct bfs_super_block) + sizeof(struct bfs_dirent)) {
+ printf("Superblock is corrupted on %s\n", s->s_id);
goto out1;
}
- info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
- sizeof(struct bfs_inode)
- + BFS_ROOT_INO - 1;
- imap_len = (info->si_lasti / 8) + 1;
- info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN);
- if (!info->si_imap) {
- printf("Cannot allocate %u bytes\n", imap_len);
+ info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1;
+ if (info->si_lasti == BFS_MAX_LASTI)
+ printf("WARNING: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id);
+ else if (info->si_lasti > BFS_MAX_LASTI) {
+ printf("Impossible last inode number %lu > %d on %s\n", info->si_lasti, BFS_MAX_LASTI, s->s_id);
goto out1;
}
for (i = 0; i < BFS_ROOT_INO; i++)
@@ -372,26 +366,25 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
inode = bfs_iget(s, BFS_ROOT_INO);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- goto out2;
+ goto out1;
}
s->s_root = d_make_root(inode);
if (!s->s_root) {
ret = -ENOMEM;
- goto out2;
+ goto out1;
}
info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
- info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1
- - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
+ info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
info->si_freei = 0;
info->si_lf_eblk = 0;
/* can we read the last block? */
bh = sb_bread(s, info->si_blocks - 1);
if (!bh) {
- printf("Last block not available: %lu\n", info->si_blocks - 1);
+ printf("Last block not available on %s: %lu\n", s->s_id, info->si_blocks - 1);
ret = -EIO;
- goto out3;
+ goto out2;
}
brelse(bh);
@@ -425,11 +418,11 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
(i_eoff != le32_to_cpu(-1) && i_eoff > s_size) ||
i_sblock * BFS_BSIZE > i_eoff) {
- printf("Inode 0x%08x corrupted\n", i);
+ printf("Inode 0x%08x corrupted on %s\n", i, s->s_id);
brelse(bh);
ret = -EIO;
- goto out3;
+ goto out2;
}
if (!di->i_ino) {
@@ -445,14 +438,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
}
brelse(bh);
brelse(sbh);
- bfs_dump_imap("read_super", s);
+ bfs_dump_imap("fill_super", s);
return 0;
-out3:
+out2:
dput(s->s_root);
s->s_root = NULL;
-out2:
- kfree(info->si_imap);
out1:
brelse(sbh);
out:
@@ -482,7 +473,7 @@ static int __init init_bfs_fs(void)
int err = init_inodecache();
if (err)
goto out1;
- err = register_filesystem(&bfs_fs_type);
+ err = register_filesystem(&bfs_fs_type);
if (err)
goto out;
return 0;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index c3deb2e..ca9725f 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -78,9 +78,9 @@ static int aout_core_dump(struct coredump_params *cprm)
/* make sure we actually have a data and stack area to dump */
set_fs(USER_DS);
- if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+ if (!access_ok(START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
dump.u_dsize = 0;
- if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+ if (!access_ok(START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
dump.u_ssize = 0;
set_fs(KERNEL_DS);
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 7cde3f4..d0078cb 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -42,10 +42,14 @@ static int load_script(struct linux_binprm *bprm)
fput(bprm->file);
bprm->file = NULL;
- bprm->buf[BINPRM_BUF_SIZE - 1] = '\0';
- if ((cp = strchr(bprm->buf, '\n')) == NULL)
- cp = bprm->buf+BINPRM_BUF_SIZE-1;
+ for (cp = bprm->buf+2;; cp++) {
+ if (cp >= bprm->buf + BINPRM_BUF_SIZE)
+ return -ENOEXEC;
+ if (!*cp || (*cp == '\n'))
+ break;
+ }
*cp = '\0';
+
while (cp > bprm->buf) {
cp--;
if ((*cp == ' ') || (*cp == '\t'))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a80b4f0..c546cdc 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -181,7 +181,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
struct task_struct *waiter = bio->bi_private;
WRITE_ONCE(bio->bi_private, NULL);
- wake_up_process(waiter);
+ blk_wake_io_task(waiter);
}
static ssize_t
@@ -232,6 +232,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
bio.bi_opf = dio_bio_write_op(iocb);
task_io_account_write(ret);
}
+ if (iocb->ki_flags & IOCB_HIPRI)
+ bio.bi_opf |= REQ_HIPRI;
qc = submit_bio(&bio);
for (;;) {
@@ -239,7 +241,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
if (!READ_ONCE(bio.bi_private))
break;
if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(bdev), qc))
+ !blk_poll(bdev_get_queue(bdev), qc, true))
io_schedule();
}
__set_current_state(TASK_RUNNING);
@@ -298,12 +300,13 @@ static void blkdev_bio_end_io(struct bio *bio)
}
dio->iocb->ki_complete(iocb, ret, 0);
- bio_put(&dio->bio);
+ if (dio->multi_bio)
+ bio_put(&dio->bio);
} else {
struct task_struct *waiter = dio->waiter;
WRITE_ONCE(dio->waiter, NULL);
- wake_up_process(waiter);
+ blk_wake_io_task(waiter);
}
}
@@ -328,6 +331,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
+ bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
loff_t pos = iocb->ki_pos;
blk_qc_t qc = BLK_QC_T_NONE;
@@ -338,20 +342,27 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
return -EINVAL;
bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
- bio_get(bio); /* extra ref for the completion handler */
dio = container_of(bio, struct blkdev_dio, bio);
dio->is_sync = is_sync = is_sync_kiocb(iocb);
- if (dio->is_sync)
+ if (dio->is_sync) {
dio->waiter = current;
- else
+ bio_get(bio);
+ } else {
dio->iocb = iocb;
+ }
dio->size = 0;
dio->multi_bio = false;
dio->should_dirty = is_read && iter_is_iovec(iter);
- blk_start_plug(&plug);
+ /*
+ * Don't plug for HIPRI/polled IO, as those should go straight
+ * to issue
+ */
+ if (!is_poll)
+ blk_start_plug(&plug);
+
for (;;) {
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> 9;
@@ -381,11 +392,21 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
if (!nr_pages) {
+ if (iocb->ki_flags & IOCB_HIPRI)
+ bio->bi_opf |= REQ_HIPRI;
+
qc = submit_bio(bio);
break;
}
if (!dio->multi_bio) {
+ /*
+ * AIO needs an extra reference to ensure the dio
+ * structure which is embedded into the first bio
+ * stays around.
+ */
+ if (!is_sync)
+ bio_get(bio);
dio->multi_bio = true;
atomic_set(&dio->ref, 2);
} else {
@@ -395,7 +416,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
submit_bio(bio);
bio = bio_alloc(GFP_KERNEL, nr_pages);
}
- blk_finish_plug(&plug);
+
+ if (!is_poll)
+ blk_finish_plug(&plug);
if (!is_sync)
return -EIOCBQUEUED;
@@ -406,7 +429,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
break;
if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(bdev), qc))
+ !blk_poll(bdev_get_queue(bdev), qc, true))
io_schedule();
}
__set_current_state(TASK_RUNNING);
@@ -1966,6 +1989,7 @@ static const struct address_space_operations def_blk_aops = {
.writepages = blkdev_writepages,
.releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
+ .migratepage = buffer_migrate_page_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
};
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 68ebe18..7855644 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -591,7 +591,7 @@ unode_aux_to_inode_list(struct ulist_node *node)
}
/*
- * We maintain three seperate rbtrees: one for direct refs, one for
+ * We maintain three separate rbtrees: one for direct refs, one for
* indirect refs which have a key, and one for indirect refs which do not
* have a key. Each tree does merge on insertion.
*
@@ -695,7 +695,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
}
/*
- * Now it's a direct ref, put it in the the direct tree. We must
+ * Now it's a direct ref, put it in the direct tree. We must
* do this last because the ref could be merged/freed here.
*/
prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL);
@@ -2020,9 +2020,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
ret = -ENOMEM;
break;
}
- extent_buffer_get(eb);
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_release_path(path);
item = btrfs_item_nr(slot);
@@ -2042,7 +2039,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
len = sizeof(*iref) + name_len;
iref = (struct btrfs_inode_ref *)((char *)iref + len);
}
- btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
}
@@ -2083,10 +2079,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
ret = -ENOMEM;
break;
}
- extent_buffer_get(eb);
-
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_release_path(path);
item_size = btrfs_item_size_nr(eb, slot);
@@ -2107,7 +2099,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
cur_offset += btrfs_inode_extref_name_len(eb, extref);
cur_offset += sizeof(*extref);
}
- btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
offset++;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 97d91e5..6f5d074 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -20,7 +20,7 @@
* new data the application may have written before commit.
*/
enum {
- BTRFS_INODE_ORDERED_DATA_CLOSE = 0,
+ BTRFS_INODE_ORDERED_DATA_CLOSE,
BTRFS_INODE_DUMMY,
BTRFS_INODE_IN_DEFRAG,
BTRFS_INODE_HAS_ASYNC_EXTENT,
@@ -29,6 +29,7 @@ enum {
BTRFS_INODE_IN_DELALLOC_LIST,
BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
+ BTRFS_INODE_SNAPSHOT_FLUSH,
};
/* in memory btrfs inode */
@@ -147,6 +148,12 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
+ * Track the transaction id of the last transaction used to create a
+ * hard link for the inode. This is used by the log tree (fsync).
+ */
+ u64 last_link_trans;
+
+ /*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
*/
@@ -253,6 +260,11 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
return false;
}
+static inline bool is_data_inode(struct inode *inode)
+{
+ return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
+}
+
static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
int mod)
{
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 2e43fba4..b0c8094 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1202,24 +1202,24 @@ static void btrfsic_read_from_block_data(
void *dstv, u32 offset, size_t len)
{
size_t cur;
- size_t offset_in_page;
+ size_t pgoff;
char *kaddr;
char *dst = (char *)dstv;
- size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(block_ctx->start);
unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
WARN_ON(offset + len > block_ctx->len);
- offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1);
+ pgoff = offset_in_page(start_offset + offset);
while (len > 0) {
- cur = min(len, ((size_t)PAGE_SIZE - offset_in_page));
+ cur = min(len, ((size_t)PAGE_SIZE - pgoff));
BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
kaddr = block_ctx->datav[i];
- memcpy(dst, kaddr + offset_in_page, cur);
+ memcpy(dst, kaddr + pgoff, cur);
dst += cur;
len -= cur;
- offset_in_page = 0;
+ pgoff = 0;
i++;
}
}
@@ -1601,7 +1601,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
BUG_ON(block_ctx->datav);
BUG_ON(block_ctx->pagev);
BUG_ON(block_ctx->mem_to_free);
- if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) {
pr_info("btrfsic: read_block() with unaligned bytenr %llu\n",
block_ctx->dev_bytenr);
return -1;
@@ -1720,7 +1720,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
num_pages = state->metablock_size >> PAGE_SHIFT;
h = (struct btrfs_header *)datav[0];
- if (memcmp(h->fsid, fs_info->fsid, BTRFS_FSID_SIZE))
+ if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
return 1;
for (i = 0; i < num_pages; i++) {
@@ -1778,7 +1778,7 @@ again:
return;
}
is_metadata = 1;
- BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1));
+ BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE));
processed_len = BTRFS_SUPER_INFO_SIZE;
if (state->print_mask &
BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
@@ -2327,7 +2327,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
* write operations. Therefore it keeps the linkage
* information for a block until a block is
* rewritten. This can temporarily cause incorrect
- * and even circular linkage informations. This
+ * and even circular linkage information. This
* causes no harm unless such blocks are referenced
* by the most recent super block.
*/
@@ -2892,12 +2892,12 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
- if (fs_info->nodesize & ((u64)PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(fs_info->nodesize)) {
pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
fs_info->nodesize, PAGE_SIZE);
return -1;
}
- if (fs_info->sectorsize & ((u64)PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(fs_info->sectorsize)) {
pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
fs_info->sectorsize, PAGE_SIZE);
return -1;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 2955a4e..5480576 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -229,7 +229,6 @@ static noinline void end_compressed_writeback(struct inode *inode,
*/
static void end_compressed_bio_write(struct bio *bio)
{
- struct extent_io_tree *tree;
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
struct page *page;
@@ -248,14 +247,10 @@ static void end_compressed_bio_write(struct bio *bio)
* call back into the FS and do all the end_io operations
*/
inode = cb->inode;
- tree = &BTRFS_I(inode)->io_tree;
cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
- tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
- cb->start,
- cb->start + cb->len - 1,
- NULL,
- bio->bi_status ?
- BLK_STS_OK : BLK_STS_NOTSUPP);
+ btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
+ cb->start, cb->start + cb->len - 1,
+ bio->bi_status ? BLK_STS_OK : BLK_STS_NOTSUPP);
cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
@@ -306,7 +301,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
blk_status_t ret;
int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- WARN_ON(start & ((u64)PAGE_SIZE - 1));
+ WARN_ON(!PAGE_ALIGNED(start));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
@@ -337,7 +332,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
page = compressed_pages[pg_index];
page->mapping = inode->i_mapping;
if (bio->bi_iter.bi_size)
- submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, bio, 0);
+ submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
+ 0);
page->mapping = NULL;
if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
@@ -481,7 +477,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (page->index == end_index) {
char *userpage;
- size_t zero_offset = isize & (PAGE_SIZE - 1);
+ size_t zero_offset = offset_in_page(isize);
if (zero_offset) {
int zeros;
@@ -615,8 +611,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page->index = em_start >> PAGE_SHIFT;
if (comp_bio->bi_iter.bi_size)
- submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE,
- comp_bio, 0);
+ submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE,
+ comp_bio, 0);
page->mapping = NULL;
if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
@@ -1207,7 +1203,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
/*
* Shannon Entropy calculation
*
- * Pure byte distribution analysis fails to determine compressiability of data.
+ * Pure byte distribution analysis fails to determine compressibility of data.
* Try calculating entropy to estimate the average minimum number of bits
* needed to encode the sampled data.
*
@@ -1271,7 +1267,7 @@ static u8 get4bits(u64 num, int shift) {
/*
* Use 4 bits as radix base
- * Use 16 u32 counters for calculating new possition in buf array
+ * Use 16 u32 counters for calculating new position in buf array
*
* @array - array that will be sorted
* @array_buf - buffer array to store sorting results
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 539901f..d92462f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -12,6 +12,7 @@
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
+#include "volumes.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -224,7 +225,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_set_header_owner(cow, new_root_objectid);
- write_extent_buffer_fsid(cow, fs_info->fsid);
+ write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
@@ -1050,7 +1051,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
else
btrfs_set_header_owner(cow, root->root_key.objectid);
- write_extent_buffer_fsid(cow, fs_info->fsid);
+ write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
@@ -1290,7 +1291,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
- extent_buffer_get(eb_rewin);
btrfs_tree_read_lock(eb_rewin);
__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
@@ -1362,7 +1362,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (!eb)
return NULL;
- extent_buffer_get(eb);
btrfs_tree_read_lock(eb);
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
@@ -1415,7 +1414,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
*
* What is forced COW:
* when we create snapshot during committing the transaction,
- * after we've finished coping src root, we must COW the shared
+ * after we've finished copying src root, we must COW the shared
* block to ensure the metadata consistency.
*/
if (btrfs_header_generation(buf) == trans->transid &&
@@ -1441,6 +1440,10 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
u64 search_start;
int ret;
+ if (test_bit(BTRFS_ROOT_DELETING, &root->state))
+ btrfs_err(fs_info,
+ "COW'ing blocks on a fs root that's being dropped");
+
if (trans->transaction != fs_info->running_transaction)
WARN(1, KERN_CRIT "trans %llu running %llu\n",
trans->transid,
@@ -2584,14 +2587,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
root_lock = BTRFS_READ_LOCK;
if (p->search_commit_root) {
- /* The commit roots are read only so we always do read locks */
- if (p->need_commit_sem)
+ /*
+ * The commit roots are read only so we always do read locks,
+ * and we always must hold the commit_root_sem when doing
+ * searches on them, the only exception is send where we don't
+ * want to block transaction commits for a long time, so
+ * we need to clone the commit root in order to avoid races
+ * with transaction commits that create a snapshot of one of
+ * the roots used by a send operation.
+ */
+ if (p->need_commit_sem) {
down_read(&fs_info->commit_root_sem);
- b = root->commit_root;
- extent_buffer_get(b);
- level = btrfs_header_level(b);
- if (p->need_commit_sem)
+ b = btrfs_clone_extent_buffer(root->commit_root);
up_read(&fs_info->commit_root_sem);
+ if (!b)
+ return ERR_PTR(-ENOMEM);
+
+ } else {
+ b = root->commit_root;
+ extent_buffer_get(b);
+ }
+ level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
* p->search_commit_root = 1.
@@ -2717,6 +2733,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto done;
+ }
while (b) {
level = btrfs_header_level(b);
@@ -3751,7 +3771,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
* it, therefore use right neighbor to insert the new item and
- * no need to touch/dirty our left leaft. */
+ * no need to touch/dirty our left leaf. */
btrfs_tree_unlock(left);
free_extent_buffer(left);
path->nodes[0] = right;
@@ -5390,7 +5410,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
ret = -ENOMEM;
goto out;
}
- extent_buffer_get(left_path->nodes[left_level]);
right_level = btrfs_header_level(right_root->commit_root);
right_root_level = right_level;
@@ -5401,7 +5420,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
ret = -ENOMEM;
goto out;
}
- extent_buffer_get(right_path->nodes[right_level]);
up_read(&fs_info->commit_root_sem);
if (left_level == 0)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 68f322f..0a68cf7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -109,13 +109,26 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
}
/*
- * File system states
+ * Runtime (in-memory) states of filesystem
*/
-#define BTRFS_FS_STATE_ERROR 0
-#define BTRFS_FS_STATE_REMOUNTING 1
-#define BTRFS_FS_STATE_TRANS_ABORTED 2
-#define BTRFS_FS_STATE_DEV_REPLACING 3
-#define BTRFS_FS_STATE_DUMMY_FS_INFO 4
+enum {
+ /* Global indicator of serious filesystem errors */
+ BTRFS_FS_STATE_ERROR,
+ /*
+ * Filesystem is being remounted, allow to skip some operations, like
+ * defrag
+ */
+ BTRFS_FS_STATE_REMOUNTING,
+ /* Track if a transaction abort has been reported on this filesystem */
+ BTRFS_FS_STATE_TRANS_ABORTED,
+ /*
+ * Bio operations should be blocked on this filesystem because a source
+ * or target device is being destroyed as part of a device replace
+ */
+ BTRFS_FS_STATE_DEV_REPLACING,
+ /* The btrfs_fs_info created for self-tests */
+ BTRFS_FS_STATE_DUMMY_FS_INFO,
+};
#define BTRFS_BACKREF_REV_MAX 256
#define BTRFS_BACKREF_REV_SHIFT 56
@@ -195,9 +208,10 @@ struct btrfs_root_backup {
* it currently lacks any block count etc etc
*/
struct btrfs_super_block {
- u8 csum[BTRFS_CSUM_SIZE];
/* the first 4 fields must match struct btrfs_header */
- u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ u8 csum[BTRFS_CSUM_SIZE];
+ /* FS specific UUID, visible to user */
+ u8 fsid[BTRFS_FSID_SIZE];
__le64 bytenr; /* this block number */
__le64 flags;
@@ -234,8 +248,11 @@ struct btrfs_super_block {
__le64 cache_generation;
__le64 uuid_tree_generation;
+ /* the UUID written into btree blocks */
+ u8 metadata_uuid[BTRFS_FSID_SIZE];
+
/* future expansion */
- __le64 reserved[30];
+ __le64 reserved[28];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
} __attribute__ ((__packed__));
@@ -265,7 +282,8 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_RAID56 | \
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
- BTRFS_FEATURE_INCOMPAT_NO_HOLES)
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID)
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -316,7 +334,7 @@ struct btrfs_node {
* The slots array records the index of the item or block pointer
* used while walking the tree.
*/
-enum { READA_NONE = 0, READA_BACK, READA_FORWARD };
+enum { READA_NONE, READA_BACK, READA_FORWARD };
struct btrfs_path {
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
@@ -360,9 +378,7 @@ struct btrfs_dev_replace {
struct btrfs_device *tgtdev;
struct mutex lock_finishing_cancel_unmount;
- rwlock_t lock;
- atomic_t blocking_readers;
- wait_queue_head_t read_lock_wq;
+ struct rw_semaphore rwsem;
struct btrfs_scrub_progress scrub_progress;
@@ -443,13 +459,19 @@ struct btrfs_space_info {
struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
};
-#define BTRFS_BLOCK_RSV_GLOBAL 1
-#define BTRFS_BLOCK_RSV_DELALLOC 2
-#define BTRFS_BLOCK_RSV_TRANS 3
-#define BTRFS_BLOCK_RSV_CHUNK 4
-#define BTRFS_BLOCK_RSV_DELOPS 5
-#define BTRFS_BLOCK_RSV_EMPTY 6
-#define BTRFS_BLOCK_RSV_TEMP 7
+/*
+ * Types of block reserves
+ */
+enum {
+ BTRFS_BLOCK_RSV_GLOBAL,
+ BTRFS_BLOCK_RSV_DELALLOC,
+ BTRFS_BLOCK_RSV_TRANS,
+ BTRFS_BLOCK_RSV_CHUNK,
+ BTRFS_BLOCK_RSV_DELOPS,
+ BTRFS_BLOCK_RSV_DELREFS,
+ BTRFS_BLOCK_RSV_EMPTY,
+ BTRFS_BLOCK_RSV_TEMP,
+};
struct btrfs_block_rsv {
u64 size;
@@ -509,18 +531,18 @@ struct btrfs_free_cluster {
};
enum btrfs_caching_type {
- BTRFS_CACHE_NO = 0,
- BTRFS_CACHE_STARTED = 1,
- BTRFS_CACHE_FAST = 2,
- BTRFS_CACHE_FINISHED = 3,
- BTRFS_CACHE_ERROR = 4,
+ BTRFS_CACHE_NO,
+ BTRFS_CACHE_STARTED,
+ BTRFS_CACHE_FAST,
+ BTRFS_CACHE_FINISHED,
+ BTRFS_CACHE_ERROR,
};
enum btrfs_disk_cache_state {
- BTRFS_DC_WRITTEN = 0,
- BTRFS_DC_ERROR = 1,
- BTRFS_DC_CLEAR = 2,
- BTRFS_DC_SETUP = 3,
+ BTRFS_DC_WRITTEN,
+ BTRFS_DC_ERROR,
+ BTRFS_DC_CLEAR,
+ BTRFS_DC_SETUP,
};
struct btrfs_caching_control {
@@ -712,41 +734,61 @@ struct btrfs_fs_devices;
struct btrfs_balance_control;
struct btrfs_delayed_root;
-#define BTRFS_FS_BARRIER 1
-#define BTRFS_FS_CLOSING_START 2
-#define BTRFS_FS_CLOSING_DONE 3
-#define BTRFS_FS_LOG_RECOVERING 4
-#define BTRFS_FS_OPEN 5
-#define BTRFS_FS_QUOTA_ENABLED 6
-#define BTRFS_FS_UPDATE_UUID_TREE_GEN 9
-#define BTRFS_FS_CREATING_FREE_SPACE_TREE 10
-#define BTRFS_FS_BTREE_ERR 11
-#define BTRFS_FS_LOG1_ERR 12
-#define BTRFS_FS_LOG2_ERR 13
-#define BTRFS_FS_QUOTA_OVERRIDE 14
-/* Used to record internally whether fs has been frozen */
-#define BTRFS_FS_FROZEN 15
-
-/*
- * Indicate that a whole-filesystem exclusive operation is running
- * (device replace, resize, device add/delete, balance)
- */
-#define BTRFS_FS_EXCL_OP 16
-
/*
- * To info transaction_kthread we need an immediate commit so it doesn't
- * need to wait for commit_interval
+ * Block group or device which contains an active swapfile. Used for preventing
+ * unsafe operations while a swapfile is active.
+ *
+ * These are sorted on (ptr, inode) (note that a block group or device can
+ * contain more than one swapfile). We compare the pointer values because we
+ * don't actually care what the object is, we just need a quick check whether
+ * the object exists in the rbtree.
*/
-#define BTRFS_FS_NEED_ASYNC_COMMIT 17
+struct btrfs_swapfile_pin {
+ struct rb_node node;
+ void *ptr;
+ struct inode *inode;
+ /*
+ * If true, ptr points to a struct btrfs_block_group_cache. Otherwise,
+ * ptr points to a struct btrfs_device.
+ */
+ bool is_block_group;
+};
-/*
- * Indicate that balance has been set up from the ioctl and is in the main
- * phase. The fs_info::balance_ctl is initialized.
- */
-#define BTRFS_FS_BALANCE_RUNNING 18
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
+
+enum {
+ BTRFS_FS_BARRIER,
+ BTRFS_FS_CLOSING_START,
+ BTRFS_FS_CLOSING_DONE,
+ BTRFS_FS_LOG_RECOVERING,
+ BTRFS_FS_OPEN,
+ BTRFS_FS_QUOTA_ENABLED,
+ BTRFS_FS_UPDATE_UUID_TREE_GEN,
+ BTRFS_FS_CREATING_FREE_SPACE_TREE,
+ BTRFS_FS_BTREE_ERR,
+ BTRFS_FS_LOG1_ERR,
+ BTRFS_FS_LOG2_ERR,
+ BTRFS_FS_QUOTA_OVERRIDE,
+ /* Used to record internally whether fs has been frozen */
+ BTRFS_FS_FROZEN,
+ /*
+ * Indicate that a whole-filesystem exclusive operation is running
+ * (device replace, resize, device add/delete, balance)
+ */
+ BTRFS_FS_EXCL_OP,
+ /*
+ * To info transaction_kthread we need an immediate commit so it
+ * doesn't need to wait for commit_interval
+ */
+ BTRFS_FS_NEED_ASYNC_COMMIT,
+ /*
+ * Indicate that balance has been set up from the ioctl and is in the
+ * main phase. The fs_info::balance_ctl is initialized.
+ */
+ BTRFS_FS_BALANCE_RUNNING,
+};
struct btrfs_fs_info {
- u8 fsid[BTRFS_FSID_SIZE];
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
struct btrfs_root *extent_root;
@@ -790,6 +832,8 @@ struct btrfs_fs_info {
struct btrfs_block_rsv chunk_block_rsv;
/* block reservation for delayed operations */
struct btrfs_block_rsv delayed_block_rsv;
+ /* block reservation for delayed refs */
+ struct btrfs_block_rsv delayed_refs_rsv;
struct btrfs_block_rsv empty_block_rsv;
@@ -1100,9 +1144,6 @@ struct btrfs_fs_info {
struct mutex unused_bg_unpin_mutex;
struct mutex delete_unused_bgs_mutex;
- /* For btrfs to record security options */
- struct security_mnt_opts security_opts;
-
/*
* Chunks that can't be freed yet (under a trim/discard operation)
* and will be latter freed. Protected by fs_info->chunk_mutex.
@@ -1114,6 +1155,10 @@ struct btrfs_fs_info {
u32 sectorsize;
u32 stripesize;
+ /* Block groups and devices containing active swapfiles. */
+ spinlock_t swapfile_pins_lock;
+ struct rb_root swapfile_pins;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1133,22 +1178,24 @@ struct btrfs_subvolume_writers {
/*
* The state of btrfs root
*/
-/*
- * btrfs_record_root_in_trans is a multi-step process,
- * and it can race with the balancing code. But the
- * race is very small, and only the first time the root
- * is added to each transaction. So IN_TRANS_SETUP
- * is used to tell us when more checks are required
- */
-#define BTRFS_ROOT_IN_TRANS_SETUP 0
-#define BTRFS_ROOT_REF_COWS 1
-#define BTRFS_ROOT_TRACK_DIRTY 2
-#define BTRFS_ROOT_IN_RADIX 3
-#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4
-#define BTRFS_ROOT_DEFRAG_RUNNING 5
-#define BTRFS_ROOT_FORCE_COW 6
-#define BTRFS_ROOT_MULTI_LOG_TASKS 7
-#define BTRFS_ROOT_DIRTY 8
+enum {
+ /*
+ * btrfs_record_root_in_trans is a multi-step process, and it can race
+ * with the balancing code. But the race is very small, and only the
+ * first time the root is added to each transaction. So IN_TRANS_SETUP
+ * is used to tell us when more checks are required
+ */
+ BTRFS_ROOT_IN_TRANS_SETUP,
+ BTRFS_ROOT_REF_COWS,
+ BTRFS_ROOT_TRACK_DIRTY,
+ BTRFS_ROOT_IN_RADIX,
+ BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+ BTRFS_ROOT_DEFRAG_RUNNING,
+ BTRFS_ROOT_FORCE_COW,
+ BTRFS_ROOT_MULTI_LOG_TASKS,
+ BTRFS_ROOT_DIRTY,
+ BTRFS_ROOT_DELETING,
+};
/*
* in ram representation of the tree. extent_root is used for all allocations
@@ -1274,6 +1321,9 @@ struct btrfs_root {
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+ /* Number of active swapfiles */
+ atomic_t nr_swapfiles;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
@@ -2570,10 +2620,10 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
/* extent-tree.c */
enum btrfs_inline_ref_type {
- BTRFS_REF_TYPE_INVALID = 0,
- BTRFS_REF_TYPE_BLOCK = 1,
- BTRFS_REF_TYPE_DATA = 2,
- BTRFS_REF_TYPE_ANY = 3,
+ BTRFS_REF_TYPE_INVALID,
+ BTRFS_REF_TYPE_BLOCK,
+ BTRFS_REF_TYPE_DATA,
+ BTRFS_REF_TYPE_ANY,
};
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
@@ -2599,7 +2649,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans);
+bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
@@ -2713,10 +2763,12 @@ enum btrfs_reserve_flush_enum {
enum btrfs_flush_state {
FLUSH_DELAYED_ITEMS_NR = 1,
FLUSH_DELAYED_ITEMS = 2,
- FLUSH_DELALLOC = 3,
- FLUSH_DELALLOC_WAIT = 4,
- ALLOC_CHUNK = 5,
- COMMIT_TRANS = 6,
+ FLUSH_DELAYED_REFS_NR = 3,
+ FLUSH_DELAYED_REFS = 4,
+ FLUSH_DELALLOC = 5,
+ FLUSH_DELALLOC_WAIT = 6,
+ ALLOC_CHUNK = 7,
+ COMMIT_TRANS = 8,
};
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
@@ -2767,6 +2819,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
+int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush);
+void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *src,
+ u64 num_bytes);
int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
@@ -2959,7 +3018,6 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->free_space_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
- security_free_mnt_opts(&fs_info->security_opts);
kvfree(fs_info);
}
@@ -3141,7 +3199,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct inode *inode, u64 new_size,
u32 min_type);
-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
unsigned int extra_bits,
@@ -3150,9 +3208,16 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
struct btrfs_root *parent_root,
u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
- size_t size, struct bio *bio,
- unsigned long bio_flags);
+ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+ unsigned *bits);
+void btrfs_clear_delalloc_extent(struct inode *inode,
+ struct extent_state *state, unsigned *bits);
+void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+ struct extent_state *other);
+void btrfs_split_delalloc_extent(struct inode *inode,
+ struct extent_state *orig, u64 split);
+int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
+ unsigned long bio_flags);
void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
@@ -3189,6 +3254,12 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
+int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+ u64 start, u64 end, int *page_started, unsigned long *nr_written,
+ struct writeback_control *wbc);
+int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
+void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+ u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
@@ -3428,6 +3499,16 @@ static inline void assfail(const char *expr, const char *file, int line)
#define ASSERT(expr) ((void)0)
#endif
+/*
+ * Use that for functions that are conditionally exported for sanity tests but
+ * otherwise static
+ */
+#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define EXPORT_FOR_TESTS static
+#else
+#define EXPORT_FOR_TESTS
+#endif
+
__cold
static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info)
{
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9301b3a..cad36c9 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -251,8 +251,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
- if (trans->delayed_ref_updates)
- trans->delayed_ref_updates--;
}
static bool merge_ref(struct btrfs_trans_handle *trans,
@@ -400,6 +398,20 @@ again:
return head;
}
+void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ lockdep_assert_held(&delayed_refs->lock);
+ lockdep_assert_held(&head->lock);
+
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
+ RB_CLEAR_NODE(&head->href_node);
+ atomic_dec(&delayed_refs->num_entries);
+ delayed_refs->num_heads--;
+ if (head->processing == 0)
+ delayed_refs->num_heads_ready--;
+}
+
/*
* Helper to insert the ref_node to the tail or merge with tail.
*
@@ -453,7 +465,6 @@ inserted:
if (ref->action == BTRFS_ADD_DELAYED_REF)
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
- trans->delayed_ref_updates++;
spin_unlock(&href->lock);
return ret;
}
@@ -462,12 +473,14 @@ inserted:
* helper function to update the accounting in the head ref
* existing and update must have the same bytenr
*/
-static noinline void
-update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *existing,
struct btrfs_delayed_ref_head *update,
int *old_ref_mod_ret)
{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int old_ref_mod;
BUG_ON(existing->is_data != update->is_data);
@@ -525,10 +538,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
* versa we need to make sure to adjust pending_csums accordingly.
*/
if (existing->is_data) {
- if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
+ u64 csum_leaves =
+ btrfs_csum_bytes_to_leaves(fs_info,
+ existing->num_bytes);
+
+ if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
delayed_refs->pending_csums -= existing->num_bytes;
- if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
+ btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
+ }
+ if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
delayed_refs->pending_csums += existing->num_bytes;
+ trans->delayed_ref_updates += csum_leaves;
+ }
}
spin_unlock(&existing->lock);
}
@@ -634,7 +655,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
&& head_ref->qgroup_reserved
&& existing->qgroup_ref_root
&& existing->qgroup_reserved);
- update_existing_head_ref(delayed_refs, existing, head_ref,
+ update_existing_head_ref(trans, existing, head_ref,
old_ref_mod);
/*
* we've updated the existing ref, free the newly
@@ -645,8 +666,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
} else {
if (old_ref_mod)
*old_ref_mod = 0;
- if (head_ref->is_data && head_ref->ref_mod < 0)
+ if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
+ trans->delayed_ref_updates +=
+ btrfs_csum_bytes_to_leaves(trans->fs_info,
+ head_ref->num_bytes);
+ }
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
@@ -782,6 +807,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
+ /*
+ * Need to update the delayed_refs_rsv with any changes we may have
+ * made.
+ */
+ btrfs_update_delayed_refs_rsv(trans);
+
trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
action == BTRFS_ADD_DELAYED_EXTENT ?
BTRFS_ADD_DELAYED_REF : action);
@@ -863,6 +894,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
+ /*
+ * Need to update the delayed_refs_rsv with any changes we may have
+ * made.
+ */
+ btrfs_update_delayed_refs_rsv(trans);
+
trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
action == BTRFS_ADD_DELAYED_EXTENT ?
BTRFS_ADD_DELAYED_REF : action);
@@ -899,6 +936,12 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
NULL, NULL, NULL);
spin_unlock(&delayed_refs->lock);
+
+ /*
+ * Need to update the delayed_refs_rsv with any changes we may have
+ * made.
+ */
+ btrfs_update_delayed_refs_rsv(trans);
return 0;
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 8e20c5c..d2af974 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -261,7 +261,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
{
mutex_unlock(&head->mutex);
}
-
+void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
struct btrfs_delayed_ref_root *delayed_refs);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2aa48ae..8750c83 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -59,7 +59,6 @@ no_valid_dev_replace_entry_found:
BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
dev_replace->cont_reading_from_srcdev_mode =
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
- dev_replace->replace_state = 0;
dev_replace->time_started = 0;
dev_replace->time_stopped = 0;
atomic64_set(&dev_replace->num_write_errors, 0);
@@ -285,13 +284,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_dev_replace_item *ptr;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_read_lock(dev_replace);
+ down_read(&dev_replace->rwsem);
if (!dev_replace->is_valid ||
!dev_replace->item_needs_writeback) {
- btrfs_dev_replace_read_unlock(dev_replace);
+ up_read(&dev_replace->rwsem);
return 0;
}
- btrfs_dev_replace_read_unlock(dev_replace);
+ up_read(&dev_replace->rwsem);
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
@@ -349,7 +348,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_dev_replace_item);
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
if (dev_replace->srcdev)
btrfs_set_dev_replace_src_devid(eb, ptr,
dev_replace->srcdev->devid);
@@ -372,7 +371,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
btrfs_set_dev_replace_cursor_right(eb, ptr,
dev_replace->cursor_right);
dev_replace->item_needs_writeback = 0;
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
btrfs_mark_buffer_dirty(eb);
@@ -390,7 +389,7 @@ static char* btrfs_dev_name(struct btrfs_device *device)
return rcu_str_deref(device->name);
}
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
+static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
{
@@ -407,6 +406,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (IS_ERR(src_device))
return PTR_ERR(src_device);
+ if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
+ btrfs_warn_in_rcu(fs_info,
+ "cannot replace device %s (devid %llu) due to active swapfile",
+ btrfs_dev_name(src_device), src_device->devid);
+ return -ETXTBSY;
+ }
+
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
if (ret)
@@ -426,7 +432,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
}
need_unlock = true;
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -464,7 +470,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
dev_replace->item_needs_writeback = 1;
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
need_unlock = false;
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
@@ -478,7 +484,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
need_unlock = true;
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->srcdev = NULL;
@@ -497,7 +503,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
ret = btrfs_dev_replace_finishing(fs_info, ret);
if (ret == -EINPROGRESS) {
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
- } else {
+ } else if (ret != -ECANCELED) {
WARN_ON(ret);
}
@@ -505,7 +511,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
leave:
if (need_unlock)
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
btrfs_destroy_dev_replace_tgtdev(tgt_device);
return ret;
}
@@ -533,8 +539,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
args->start.cont_reading_from_srcdev_mode);
args->result = ret;
/* don't warn if EINPROGRESS, someone else might be running scrub */
- if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
- ret = 0;
+ if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
+ ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
+ return 0;
return ret;
}
@@ -572,18 +579,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* don't allow cancel or unmount to disturb the finishing procedure */
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_read_lock(dev_replace);
+ down_read(&dev_replace->rwsem);
/* was the operation canceled, or is it finished? */
if (dev_replace->replace_state !=
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
- btrfs_dev_replace_read_unlock(dev_replace);
+ up_read(&dev_replace->rwsem);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return 0;
}
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
- btrfs_dev_replace_read_unlock(dev_replace);
+ up_read(&dev_replace->rwsem);
/*
* flush all outstanding I/O and inode extent mappings before the
@@ -607,7 +614,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* keep away write_all_supers() during the finishing procedure */
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
: BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -622,12 +629,13 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device,
tgt_device);
} else {
- btrfs_err_in_rcu(fs_info,
+ if (scrub_ret != -ECANCELED)
+ btrfs_err_in_rcu(fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_rm_dev_replace_blocked(fs_info);
@@ -663,8 +671,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
- btrfs_dev_replace_write_unlock(dev_replace);
-
+ up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
btrfs_rm_dev_replace_remove_srcdev(src_device);
@@ -761,7 +768,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_read_lock(dev_replace);
+ down_read(&dev_replace->rwsem);
/* even if !dev_replace_is_valid, the values are good enough for
* the replace_status ioctl */
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -773,7 +780,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
args->status.num_uncorrectable_read_errors =
atomic64_read(&dev_replace->num_uncorrectable_read_errors);
args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
- btrfs_dev_replace_read_unlock(dev_replace);
+ up_read(&dev_replace->rwsem);
}
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
@@ -790,46 +797,74 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
return -EROFS;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
- btrfs_dev_replace_write_unlock(dev_replace);
- goto leave;
+ up_write(&dev_replace->rwsem);
+ break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
+ up_write(&dev_replace->rwsem);
+ ret = btrfs_scrub_cancel(fs_info);
+ if (ret < 0) {
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+ } else {
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ /*
+ * btrfs_dev_replace_finishing() will handle the
+ * cleanup part
+ */
+ btrfs_info_in_rcu(fs_info,
+ "dev_replace from %s (devid %llu) to %s canceled",
+ btrfs_dev_name(src_device), src_device->devid,
+ btrfs_dev_name(tgt_device));
+ }
+ break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ /*
+ * Scrub doing the replace isn't running so we need to do the
+ * cleanup step of btrfs_dev_replace_finishing() here
+ */
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
- break;
- }
- dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
- dev_replace->time_stopped = ktime_get_real_seconds();
- dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_write_unlock(dev_replace);
- btrfs_scrub_cancel(fs_info);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+ dev_replace->time_stopped = ktime_get_real_seconds();
+ dev_replace->item_needs_writeback = 1;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return PTR_ERR(trans);
- }
- ret = btrfs_commit_transaction(trans);
- WARN_ON(ret);
+ up_write(&dev_replace->rwsem);
- btrfs_info_in_rcu(fs_info,
- "dev_replace from %s (devid %llu) to %s canceled",
- btrfs_dev_name(src_device), src_device->devid,
- btrfs_dev_name(tgt_device));
+ /* Scrub for replace must not be running in suspended state */
+ ret = btrfs_scrub_cancel(fs_info);
+ ASSERT(ret != -ENOTCONN);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans);
+ WARN_ON(ret);
- if (tgt_device)
- btrfs_destroy_dev_replace_tgtdev(tgt_device);
+ btrfs_info_in_rcu(fs_info,
+ "suspended dev_replace from %s (devid %llu) to %s canceled",
+ btrfs_dev_name(src_device), src_device->devid,
+ btrfs_dev_name(tgt_device));
+
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(tgt_device);
+ break;
+ default:
+ result = -EINVAL;
+ }
-leave:
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return result;
}
@@ -839,7 +874,8 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
+
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -855,7 +891,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
break;
}
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
}
@@ -865,12 +901,13 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
struct task_struct *task;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_write_lock(dev_replace);
+ down_write(&dev_replace->rwsem);
+
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
return 0;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
break;
@@ -884,10 +921,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
"cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
"you may cancel the operation after 'mount -o degraded'");
- btrfs_dev_replace_write_unlock(dev_replace);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+ up_write(&dev_replace->rwsem);
return 0;
}
- btrfs_dev_replace_write_unlock(dev_replace);
+ up_write(&dev_replace->rwsem);
/*
* This could collide with a paused balance, but the exclusive op logic
@@ -895,6 +934,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
* dev-replace to start anyway.
*/
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ down_write(&dev_replace->rwsem);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+ up_write(&dev_replace->rwsem);
btrfs_info(fs_info,
"cannot resume dev-replace, other exclusive operation running");
return 0;
@@ -925,7 +968,7 @@ static int btrfs_dev_replace_kthread(void *data)
btrfs_device_get_total_bytes(dev_replace->srcdev),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
- WARN_ON(ret);
+ WARN_ON(ret && ret != -ECANCELED);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return 0;
@@ -948,7 +991,7 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
* something that can happen if the dev_replace
* procedure is suspended by an umount and then
* the tgtdev is missing (or "btrfs dev scan") was
- * not called and the the filesystem is remounted
+ * not called and the filesystem is remounted
* in degraded state. This does not stop the
* dev_replace procedure. It needs to be canceled
* manually if the cancellation is wanted.
@@ -958,42 +1001,6 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
return 1;
}
-void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
-{
- read_lock(&dev_replace->lock);
-}
-
-void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
-{
- read_unlock(&dev_replace->lock);
-}
-
-void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace)
-{
-again:
- wait_event(dev_replace->read_lock_wq,
- atomic_read(&dev_replace->blocking_readers) == 0);
- write_lock(&dev_replace->lock);
- if (atomic_read(&dev_replace->blocking_readers)) {
- write_unlock(&dev_replace->lock);
- goto again;
- }
-}
-
-void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace)
-{
- write_unlock(&dev_replace->lock);
-}
-
-/* inc blocking cnt and release read lock */
-void btrfs_dev_replace_set_lock_blocking(
- struct btrfs_dev_replace *dev_replace)
-{
- /* only set blocking for read lock */
- atomic_inc(&dev_replace->blocking_readers);
- read_unlock(&dev_replace->lock);
-}
-
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 795c551..4aa40ba 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -13,19 +13,11 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
- const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
- int read_src);
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3f0b6d1..8da2f38 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -279,6 +279,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
len = buf->len - offset;
while (len > 0) {
+ /*
+ * Note: we don't need to check for the err == 1 case here, as
+ * with the given combination of 'start = BTRFS_CSUM_SIZE (32)'
+ * and 'min_len = 32' and the currently implemented mapping
+ * algorithm we cannot cross a page boundary.
+ */
err = map_private_extent_buffer(buf, offset, 32,
&kaddr, &map_start, &map_len);
if (err)
@@ -477,9 +483,9 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
int mirror_num = 0;
int failed_mirror = 0;
- clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
while (1) {
+ clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
mirror_num);
if (!ret) {
@@ -493,15 +499,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
break;
}
- /*
- * This buffer's crc is fine, but its contents are corrupted, so
- * there is no reason to read the other copies, they won't be
- * any less wrong.
- */
- if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) ||
- ret == -EUCLEAN)
- break;
-
num_copies = btrfs_num_copies(fs_info,
eb->start, eb->len);
if (num_copies == 1)
@@ -551,7 +548,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
if (WARN_ON(!PageUptodate(page)))
return -EUCLEAN;
- ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
+ ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
return csum_tree_block(fs_info, eb, 0);
@@ -566,7 +563,20 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
while (fs_devices) {
- if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
+ u8 *metadata_uuid;
+
+ /*
+ * Checking the incompat flag is only valid for the current
+ * fs. For seed devices it's forbidden to have their uuid
+ * changed so reading ->fsid in this case is fine
+ */
+ if (fs_devices == fs_info->fs_devices &&
+ btrfs_fs_incompat(fs_info, METADATA_UUID))
+ metadata_uuid = fs_devices->metadata_uuid;
+ else
+ metadata_uuid = fs_devices->fsid;
+
+ if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
ret = 0;
break;
}
@@ -669,19 +679,6 @@ out:
return ret;
}
-static int btree_io_failed_hook(struct page *page, int failed_mirror)
-{
- struct extent_buffer *eb;
-
- eb = (struct extent_buffer *)page->private;
- set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
- eb->read_mirror = failed_mirror;
- atomic_dec(&eb->io_pages);
- if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb, -EIO);
- return -EIO; /* we fixed nothing */
-}
-
static void end_workqueue_bio(struct bio *bio)
{
struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
@@ -760,11 +757,22 @@ static void run_one_async_start(struct btrfs_work *work)
async->status = ret;
}
+/*
+ * In order to insert checksums into the metadata in large chunks, we wait
+ * until bio submission time. All the pages in the bio are checksummed and
+ * sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the csums attached on the ordered extent record are
+ * inserted into the tree.
+ */
static void run_one_async_done(struct btrfs_work *work)
{
struct async_submit_bio *async;
+ struct inode *inode;
+ blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
+ inode = async->private_data;
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
@@ -773,7 +781,12 @@ static void run_one_async_done(struct btrfs_work *work)
return;
}
- btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num);
+ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio,
+ async->mirror_num, 1);
+ if (ret) {
+ async->bio->bi_status = ret;
+ bio_endio(async->bio);
+ }
}
static void run_one_async_free(struct btrfs_work *work)
@@ -1187,6 +1200,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
atomic_set(&root->snapshot_force_cow, 0);
+ atomic_set(&root->nr_swapfiles, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -2127,10 +2141,8 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
- rwlock_init(&fs_info->dev_replace.lock);
- atomic_set(&fs_info->dev_replace.blocking_readers, 0);
+ init_rwsem(&fs_info->dev_replace.rwsem);
init_waitqueue_head(&fs_info->dev_replace.replace_wait);
- init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
}
static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
@@ -2451,10 +2463,11 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
+ if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
+ BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
- "dev_item UUID does not match fsid: %pU != %pU",
- fs_info->fsid, sb->dev_item.fsid);
+ "dev_item UUID does not match metadata fsid: %pU != %pU",
+ fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
ret = -EINVAL;
}
@@ -2665,6 +2678,9 @@ int open_ctree(struct super_block *sb,
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
BTRFS_BLOCK_RSV_DELOPS);
+ btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
+ BTRFS_BLOCK_RSV_DELREFS);
+
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->defrag_running, 0);
atomic_set(&fs_info->qgroup_op_seq, 0);
@@ -2754,6 +2770,9 @@ int open_ctree(struct super_block *sb,
fs_info->sectorsize = 4096;
fs_info->stripesize = 4096;
+ spin_lock_init(&fs_info->swapfile_pins_lock);
+ fs_info->swapfile_pins = RB_ROOT;
+
ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = ret;
@@ -2790,11 +2809,29 @@ int open_ctree(struct super_block *sb,
* the whole block of INFO_SIZE
*/
memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
- memcpy(fs_info->super_for_commit, fs_info->super_copy,
- sizeof(*fs_info->super_for_commit));
brelse(bh);
- memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
+ disk_super = fs_info->super_copy;
+
+ ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
+ BTRFS_FSID_SIZE));
+
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
+ ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
+ fs_info->super_copy->metadata_uuid,
+ BTRFS_FSID_SIZE));
+ }
+
+ features = btrfs_super_flags(disk_super);
+ if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
+ features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
+ btrfs_set_super_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "found metadata UUID change in progress flag, clearing");
+ }
+
+ memcpy(fs_info->super_for_commit, fs_info->super_copy,
+ sizeof(*fs_info->super_for_commit));
ret = btrfs_validate_mount_super(fs_info);
if (ret) {
@@ -2803,7 +2840,6 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
goto fail_alloc;
@@ -2915,7 +2951,7 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
- memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE);
+ memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_sys_array(fs_info);
@@ -3064,7 +3100,7 @@ retry_root_backup:
if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
- "writeable mount is not allowed due to too many missing devices");
+ "writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
}
@@ -3733,7 +3769,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
btrfs_set_stack_device_io_width(dev_item, dev->io_width);
btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
- memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE);
+ memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE);
flags = btrfs_super_flags(sb);
btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
@@ -4040,7 +4077,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
* This is a fast path so only do this check if we have sanity tests
- * enabled. Normal people shouldn't be using umapped buffers as dirty
+ * enabled. Normal people shouldn't be using unmapped buffers as dirty
* outside of the sanity tests.
*/
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
@@ -4338,6 +4375,8 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
unpin = pinned_extents;
again:
while (1) {
+ struct extent_state *cached_state = NULL;
+
/*
* The btrfs_finish_extent_commit() may get the same range as
* ours between find_first_extent_bit and clear_extent_dirty.
@@ -4346,13 +4385,14 @@ again:
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, NULL);
+ EXTENT_DIRTY, &cached_state);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
- clear_extent_dirty(unpin, start, end);
+ clear_extent_dirty(unpin, start, end, &cached_state);
+ free_extent_state(cached_state);
btrfs_error_unpin_extent_range(fs_info, start, end);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
@@ -4409,6 +4449,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_put_block_group(cache);
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4514,7 +4555,4 @@ static const struct extent_io_ops btree_extent_io_ops = {
/* mandatory callbacks */
.submit_bio_hook = btree_submit_bio_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
- .readpage_io_failed_hook = btree_io_failed_hook,
-
- /* optional callbacks */
};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4cccba2..987a64b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -21,11 +21,11 @@
#define BTRFS_BDEV_BLOCKSIZE (4096)
enum btrfs_wq_endio_type {
- BTRFS_WQ_ENDIO_DATA = 0,
- BTRFS_WQ_ENDIO_METADATA = 1,
- BTRFS_WQ_ENDIO_FREE_SPACE = 2,
- BTRFS_WQ_ENDIO_RAID56 = 3,
- BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
+ BTRFS_WQ_ENDIO_DATA,
+ BTRFS_WQ_ENDIO_METADATA,
+ BTRFS_WQ_ENDIO_FREE_SPACE,
+ BTRFS_WQ_ENDIO_RAID56,
+ BTRFS_WQ_ENDIO_DIO_REPAIR,
};
static inline u64 btrfs_sb_offset(int mirror)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a1febf1..b15afea 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -51,6 +51,24 @@ enum {
CHUNK_ALLOC_FORCE = 2,
};
+/*
+ * Declare a helper function to detect underflow of various space info members
+ */
+#define DECLARE_SPACE_INFO_UPDATE(name) \
+static inline void update_##name(struct btrfs_space_info *sinfo, \
+ s64 bytes) \
+{ \
+ if (bytes < 0 && sinfo->name < -bytes) { \
+ WARN_ON(1); \
+ sinfo->name = 0; \
+ return; \
+ } \
+ sinfo->name += bytes; \
+}
+
+DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
+DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
+
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
@@ -1037,7 +1055,7 @@ out_free:
/*
* is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
- * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
+ * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
*/
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
@@ -2406,25 +2424,82 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref
btrfs_delayed_ref_unlock(head);
}
-static int cleanup_extent_op(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head)
+static struct btrfs_delayed_extent_op *cleanup_extent_op(
+ struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
- int ret;
if (!extent_op)
- return 0;
- head->extent_op = NULL;
+ return NULL;
+
if (head->must_insert_reserved) {
+ head->extent_op = NULL;
btrfs_free_delayed_extent_op(extent_op);
- return 0;
+ return NULL;
}
+ return extent_op;
+}
+
+static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_extent_op *extent_op;
+ int ret;
+
+ extent_op = cleanup_extent_op(head);
+ if (!extent_op)
+ return 0;
+ head->extent_op = NULL;
spin_unlock(&head->lock);
ret = run_delayed_extent_op(trans, head, extent_op);
btrfs_free_delayed_extent_op(extent_op);
return ret ? ret : 1;
}
+static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ int nr_items = 1; /* Dropping this ref head update. */
+
+ if (head->total_ref_mod < 0) {
+ struct btrfs_space_info *space_info;
+ u64 flags;
+
+ if (head->is_data)
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ else if (head->is_system)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ space_info = __find_space_info(fs_info, flags);
+ ASSERT(space_info);
+ percpu_counter_add_batch(&space_info->total_bytes_pinned,
+ -head->num_bytes,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH);
+
+ /*
+ * We had csum deletions accounted for in our delayed refs rsv,
+ * we need to drop the csum leaves for this update from our
+ * delayed_refs_rsv.
+ */
+ if (head->is_data) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= head->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ nr_items += btrfs_csum_bytes_to_leaves(fs_info,
+ head->num_bytes);
+ }
+ }
+
+ /* Also free its reserved qgroup space */
+ btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
+ head->qgroup_reserved);
+ btrfs_delayed_refs_rsv_release(fs_info, nr_items);
+}
+
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head)
{
@@ -2435,7 +2510,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
- ret = cleanup_extent_op(trans, head);
+ ret = run_and_cleanup_extent_op(trans, head);
if (ret < 0) {
unselect_delayed_ref_head(delayed_refs, head);
btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
@@ -2456,37 +2531,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
return 1;
}
- delayed_refs->num_heads--;
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
+ btrfs_delete_ref_head(delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
- atomic_dec(&delayed_refs->num_entries);
-
- trace_run_delayed_ref_head(fs_info, head, 0);
-
- if (head->total_ref_mod < 0) {
- struct btrfs_space_info *space_info;
- u64 flags;
-
- if (head->is_data)
- flags = BTRFS_BLOCK_GROUP_DATA;
- else if (head->is_system)
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
- else
- flags = BTRFS_BLOCK_GROUP_METADATA;
- space_info = __find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -head->num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-
- if (head->is_data) {
- spin_lock(&delayed_refs->lock);
- delayed_refs->pending_csums -= head->num_bytes;
- spin_unlock(&delayed_refs->lock);
- }
- }
if (head->must_insert_reserved) {
btrfs_pin_extent(fs_info, head->bytenr,
@@ -2497,9 +2544,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
}
}
- /* Also free its reserved qgroup space */
- btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
- head->qgroup_reserved);
+ cleanup_ref_head_accounting(trans, head);
+
+ trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
btrfs_put_delayed_ref_head(head);
return 0;
@@ -2792,40 +2839,28 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
return num_csums;
}
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans)
+bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_rsv *global_rsv;
- u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
- u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
- unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
- u64 num_bytes, num_dirty_bgs_bytes;
- int ret = 0;
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ bool ret = false;
+ u64 reserved;
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
- num_heads = heads_to_leaves(fs_info, num_heads);
- if (num_heads > 1)
- num_bytes += (num_heads - 1) * fs_info->nodesize;
- num_bytes <<= 1;
- num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
- fs_info->nodesize;
- num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
- num_dirty_bgs);
- global_rsv = &fs_info->global_block_rsv;
+ spin_lock(&global_rsv->lock);
+ reserved = global_rsv->reserved;
+ spin_unlock(&global_rsv->lock);
/*
- * If we can't allocate any more chunks lets make sure we have _lots_ of
- * wiggle room since running delayed refs can create more delayed refs.
+ * Since the global reserve is just kind of magic we don't really want
+ * to rely on it to save our bacon, so if our size is more than the
+ * delayed_refs_rsv and the global rsv then it's time to think about
+ * bailing.
*/
- if (global_rsv->space_info->full) {
- num_dirty_bgs_bytes <<= 1;
- num_bytes <<= 1;
- }
-
- spin_lock(&global_rsv->lock);
- if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
- ret = 1;
- spin_unlock(&global_rsv->lock);
+ spin_lock(&delayed_refs_rsv->lock);
+ reserved += delayed_refs_rsv->reserved;
+ if (delayed_refs_rsv->size >= reserved)
+ ret = true;
+ spin_unlock(&delayed_refs_rsv->lock);
return ret;
}
@@ -2844,7 +2879,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
if (val >= NSEC_PER_SEC / 2)
return 2;
- return btrfs_check_space_for_delayed_refs(trans);
+ return btrfs_check_space_for_delayed_refs(trans->fs_info);
}
struct async_delayed_refs {
@@ -3588,6 +3623,8 @@ again:
*/
mutex_lock(&trans->transaction->cache_write_mutex);
while (!list_empty(&dirty)) {
+ bool drop_reserve = true;
+
cache = list_first_entry(&dirty,
struct btrfs_block_group_cache,
dirty_list);
@@ -3660,6 +3697,7 @@ again:
list_add_tail(&cache->dirty_list,
&cur_trans->dirty_bgs);
btrfs_get_block_group(cache);
+ drop_reserve = false;
}
spin_unlock(&cur_trans->dirty_bgs_lock);
} else if (ret) {
@@ -3667,9 +3705,11 @@ again:
}
}
- /* if its not on the io list, we need to put the block group */
+ /* if it's not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
+ if (drop_reserve)
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
if (ret)
break;
@@ -3818,6 +3858,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
/* if its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4256,7 +4297,7 @@ commit_trans:
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
- data_sinfo->bytes_may_use += bytes;
+ update_bytes_may_use(data_sinfo, bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
data_sinfo->flags, bytes, 1);
spin_unlock(&data_sinfo->lock);
@@ -4309,10 +4350,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
data_sinfo = fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
- if (WARN_ON(data_sinfo->bytes_may_use < len))
- data_sinfo->bytes_may_use = 0;
- else
- data_sinfo->bytes_may_use -= len;
+ update_bytes_may_use(data_sinfo, -len);
trace_btrfs_space_reservation(fs_info, "space_info",
data_sinfo->flags, len, 0);
spin_unlock(&data_sinfo->lock);
@@ -4637,7 +4675,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info,
/*
* If we have dup, raid1 or raid10 then only half of the free
- * space is actually useable. For raid56, the space info used
+ * space is actually usable. For raid56, the space info used
* doesn't include the parity drive, so we don't have to
* change the math
*/
@@ -4793,8 +4831,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket *ticket = NULL;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_trans_handle *trans;
- u64 bytes;
+ u64 bytes_needed;
+ u64 reclaim_bytes = 0;
trans = (struct btrfs_trans_handle *)current->journal_info;
if (trans)
@@ -4807,15 +4847,15 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
else if (!list_empty(&space_info->tickets))
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- bytes = (ticket) ? ticket->bytes : 0;
+ bytes_needed = (ticket) ? ticket->bytes : 0;
spin_unlock(&space_info->lock);
- if (!bytes)
+ if (!bytes_needed)
return 0;
/* See if there is enough pinned space to make this reservation */
if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes,
+ bytes_needed,
BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
goto commit;
@@ -4827,14 +4867,18 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
return -ENOSPC;
spin_lock(&delayed_rsv->lock);
- if (delayed_rsv->size > bytes)
- bytes = 0;
- else
- bytes -= delayed_rsv->size;
+ reclaim_bytes += delayed_rsv->reserved;
spin_unlock(&delayed_rsv->lock);
+ spin_lock(&delayed_refs_rsv->lock);
+ reclaim_bytes += delayed_refs_rsv->reserved;
+ spin_unlock(&delayed_refs_rsv->lock);
+ if (reclaim_bytes >= bytes_needed)
+ goto commit;
+ bytes_needed -= reclaim_bytes;
+
if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes,
+ bytes_needed,
BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
return -ENOSPC;
}
@@ -4882,6 +4926,20 @@ static void flush_space(struct btrfs_fs_info *fs_info,
shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
+ case FLUSH_DELAYED_REFS_NR:
+ case FLUSH_DELAYED_REFS:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ if (state == FLUSH_DELAYED_REFS_NR)
+ nr = calc_reclaim_items_nr(fs_info, num_bytes);
+ else
+ nr = 0;
+ btrfs_run_delayed_refs(trans, nr);
+ btrfs_end_transaction(trans);
+ break;
case ALLOC_CHUNK:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -5108,7 +5166,7 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
list_del_init(&ticket->list);
if (ticket->bytes && ticket->bytes < orig_bytes) {
u64 num_bytes = orig_bytes - ticket->bytes;
- space_info->bytes_may_use -= num_bytes;
+ update_bytes_may_use(space_info, -num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, num_bytes, 0);
}
@@ -5154,13 +5212,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* If not things get more complicated.
*/
if (used + orig_bytes <= space_info->total_bytes) {
- space_info->bytes_may_use += orig_bytes;
+ update_bytes_may_use(space_info, orig_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, orig_bytes, 1);
ret = 0;
} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
system_chunk)) {
- space_info->bytes_may_use += orig_bytes;
+ update_bytes_may_use(space_info, orig_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, orig_bytes, 1);
ret = 0;
@@ -5223,7 +5281,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
if (ticket.bytes) {
if (ticket.bytes < orig_bytes) {
u64 num_bytes = orig_bytes - ticket.bytes;
- space_info->bytes_may_use -= num_bytes;
+ update_bytes_may_use(space_info, -num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags,
num_bytes, 0);
@@ -5244,7 +5302,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* @orig_bytes - the number of bytes we want
* @flush - whether or not we can flush to make our reservation
*
- * This will reserve orgi_bytes number of bytes from the space info associated
+ * This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
* flush out space to make room. It will do this by flushing delalloc if
* possible or committing the transaction. If flush is 0 then no attempts to
@@ -5354,6 +5412,90 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
return 0;
}
+/**
+ * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
+ * @fs_info - the fs info for our fs.
+ * @src - the source block rsv to transfer from.
+ * @num_bytes - the number of bytes to transfer.
+ *
+ * This transfers up to the num_bytes amount from the src rsv to the
+ * delayed_refs_rsv. Any extra bytes are returned to the space info.
+ */
+void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *src,
+ u64 num_bytes)
+{
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ u64 to_free = 0;
+
+ spin_lock(&src->lock);
+ src->reserved -= num_bytes;
+ src->size -= num_bytes;
+ spin_unlock(&src->lock);
+
+ spin_lock(&delayed_refs_rsv->lock);
+ if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
+ u64 delta = delayed_refs_rsv->size -
+ delayed_refs_rsv->reserved;
+ if (num_bytes > delta) {
+ to_free = num_bytes - delta;
+ num_bytes = delta;
+ }
+ } else {
+ to_free = num_bytes;
+ num_bytes = 0;
+ }
+
+ if (num_bytes)
+ delayed_refs_rsv->reserved += num_bytes;
+ if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
+ delayed_refs_rsv->full = 1;
+ spin_unlock(&delayed_refs_rsv->lock);
+
+ if (num_bytes)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, num_bytes, 1);
+ if (to_free)
+ space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
+ to_free);
+}
+
+/**
+ * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
+ * @fs_info - the fs_info for our fs.
+ * @flush - control how we can flush for this reservation.
+ *
+ * This will refill the delayed block_rsv up to 1 items size worth of space and
+ * will return -ENOSPC if we can't make the reservation.
+ */
+int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+ u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size) {
+ num_bytes = block_rsv->size - block_rsv->reserved;
+ num_bytes = min(num_bytes, limit);
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (!num_bytes)
+ return 0;
+
+ ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
+ num_bytes, flush);
+ if (ret)
+ return ret;
+ block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, num_bytes, 1);
+ return 0;
+}
+
/*
* This is for space we already have accounted in space_info->bytes_may_use, so
* basically when we're returning space from block_rsv's.
@@ -5407,7 +5549,7 @@ again:
flush = BTRFS_RESERVE_FLUSH_ALL;
goto again;
}
- space_info->bytes_may_use -= num_bytes;
+ update_bytes_may_use(space_info, -num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags, num_bytes, 0);
spin_unlock(&space_info->lock);
@@ -5435,7 +5577,7 @@ again:
ticket->bytes, 1);
list_del_init(&ticket->list);
num_bytes -= ticket->bytes;
- space_info->bytes_may_use += ticket->bytes;
+ update_bytes_may_use(space_info, ticket->bytes);
ticket->bytes = 0;
space_info->tickets_id++;
wake_up(&ticket->wait);
@@ -5443,7 +5585,7 @@ again:
trace_btrfs_space_reservation(fs_info, "space_info",
space_info->flags,
num_bytes, 1);
- space_info->bytes_may_use += num_bytes;
+ update_bytes_may_use(space_info, num_bytes);
ticket->bytes -= num_bytes;
num_bytes = 0;
}
@@ -5629,11 +5771,11 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
/**
* btrfs_inode_rsv_refill - refill the inode block rsv.
* @inode - the inode we are refilling.
- * @flush - the flusing restriction.
+ * @flush - the flushing restriction.
*
* Essentially the same as btrfs_block_rsv_refill, except it uses the
* block_rsv->size as the minimum size. We'll either refill the missing amount
- * or return if we already have enough space. This will also handle the resreve
+ * or return if we already have enough space. This will also handle the reserve
* tracepoint for the reserved amount.
*/
static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
@@ -5674,6 +5816,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
return ret;
}
+static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, u64 *qgroup_to_release)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *target = delayed_rsv;
+
+ if (target->full || target == block_rsv)
+ target = global_rsv;
+
+ if (block_rsv->space_info != target->space_info)
+ target = NULL;
+
+ return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
+ qgroup_to_release);
+}
+
+void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
+}
+
/**
* btrfs_inode_rsv_release - release any excessive reservation.
* @inode - the inode we need to release from.
@@ -5688,7 +5855,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 released = 0;
u64 qgroup_to_release = 0;
@@ -5698,8 +5864,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
* are releasing 0 bytes, and then we'll just get the reservation over
* the size free'd.
*/
- released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
- &qgroup_to_release);
+ released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
+ &qgroup_to_release);
if (released > 0)
trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), released, 0);
@@ -5710,16 +5876,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
qgroup_to_release);
}
-void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
+/**
+ * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
+ * @fs_info - the fs_info for our fs.
+ * @nr - the number of items to drop.
+ *
+ * This drops the delayed ref head's count from the delayed refs rsv and frees
+ * any excess reservation we had.
+ */
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
{
+ struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
+ u64 released = 0;
- if (global_rsv == block_rsv ||
- block_rsv->space_info != global_rsv->space_info)
- global_rsv = NULL;
- block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
+ released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
+ num_bytes, NULL);
+ if (released)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, released, 0);
}
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -5750,14 +5926,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
num_bytes = min(num_bytes,
block_rsv->size - block_rsv->reserved);
block_rsv->reserved += num_bytes;
- sinfo->bytes_may_use += num_bytes;
+ update_bytes_may_use(sinfo, num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
sinfo->flags, num_bytes,
1);
}
} else if (block_rsv->reserved > block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
- sinfo->bytes_may_use -= num_bytes;
+ update_bytes_may_use(sinfo, -num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
sinfo->flags, num_bytes, 0);
block_rsv->reserved = block_rsv->size;
@@ -5784,9 +5960,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->trans_block_rsv.space_info = space_info;
fs_info->empty_block_rsv.space_info = space_info;
fs_info->delayed_block_rsv.space_info = space_info;
+ fs_info->delayed_refs_rsv.space_info = space_info;
- fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
+ fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
if (fs_info->quota_root)
@@ -5806,8 +5983,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
WARN_ON(fs_info->delayed_block_rsv.size > 0);
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
+ WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
+ WARN_ON(fs_info->delayed_refs_rsv.size > 0);
}
+/*
+ * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
+ * @trans - the trans that may have generated delayed refs
+ *
+ * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
+ * it'll calculate the additional size and add it to the delayed_refs_rsv.
+ */
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ u64 num_bytes;
+
+ if (!trans->delayed_ref_updates)
+ return;
+
+ num_bytes = btrfs_calc_trans_metadata_size(fs_info,
+ trans->delayed_ref_updates);
+ spin_lock(&delayed_rsv->lock);
+ delayed_rsv->size += num_bytes;
+ delayed_rsv->full = 0;
+ spin_unlock(&delayed_rsv->lock);
+ trans->delayed_ref_updates = 0;
+}
/*
* To be called after all the new block groups attached to the transaction
@@ -6100,6 +6303,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
u64 old_val;
u64 byte_in_group;
int factor;
+ int ret = 0;
/* block accounting for super block */
spin_lock(&info->delalloc_root_lock);
@@ -6113,8 +6317,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
while (total) {
cache = btrfs_lookup_block_group(info, bytenr);
- if (!cache)
- return -ENOENT;
+ if (!cache) {
+ ret = -ENOENT;
+ break;
+ }
factor = btrfs_bg_type_to_factor(cache->flags);
/*
@@ -6151,7 +6357,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
old_val -= num_bytes;
btrfs_set_block_group_used(&cache->item, old_val);
cache->pinned += num_bytes;
- cache->space_info->bytes_pinned += num_bytes;
+ update_bytes_pinned(cache->space_info, num_bytes);
cache->space_info->bytes_used -= num_bytes;
cache->space_info->disk_used -= num_bytes * factor;
spin_unlock(&cache->lock);
@@ -6173,6 +6379,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
list_add_tail(&cache->dirty_list,
&trans->transaction->dirty_bgs);
trans->transaction->num_dirty_bgs++;
+ trans->delayed_ref_updates++;
btrfs_get_block_group(cache);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
@@ -6190,7 +6397,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
total -= num_bytes;
bytenr += num_bytes;
}
- return 0;
+
+ /* Modified block groups are accounted for in the delayed_refs_rsv. */
+ btrfs_update_delayed_refs_rsv(trans);
+ return ret;
}
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
@@ -6222,7 +6432,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info,
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += num_bytes;
- cache->space_info->bytes_pinned += num_bytes;
+ update_bytes_pinned(cache->space_info, num_bytes);
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
@@ -6431,7 +6641,7 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
} else {
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
- space_info->bytes_may_use -= ram_bytes;
+ update_bytes_may_use(space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
}
@@ -6587,7 +6797,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
- space_info->bytes_pinned -= len;
+ update_bytes_pinned(space_info, -len);
trace_btrfs_space_reservation(fs_info, "pinned",
space_info->flags, len, 0);
@@ -6608,7 +6818,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
to_add = min(len, global_rsv->size -
global_rsv->reserved);
global_rsv->reserved += to_add;
- space_info->bytes_may_use += to_add;
+ update_bytes_may_use(space_info, to_add);
if (global_rsv->reserved >= global_rsv->size)
global_rsv->full = 1;
trace_btrfs_space_reservation(fs_info,
@@ -6647,9 +6857,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
unpin = &fs_info->freed_extents[0];
while (!trans->aborted) {
+ struct extent_state *cached_state = NULL;
+
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, NULL);
+ EXTENT_DIRTY, &cached_state);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
@@ -6659,9 +6871,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
ret = btrfs_discard_extent(fs_info, start,
end + 1 - start, NULL);
- clear_extent_dirty(unpin, start, end);
+ clear_extent_dirty(unpin, start, end, &cached_state);
unpin_extent_range(fs_info, start, end, true);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ free_extent_state(cached_state);
cond_resched();
}
@@ -6955,12 +7168,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
goto out;
- if (head->extent_op) {
- if (!head->must_insert_reserved)
- goto out;
- btrfs_free_delayed_extent_op(head->extent_op);
- head->extent_op = NULL;
- }
+ if (cleanup_extent_op(head) != NULL)
+ goto out;
/*
* waiting for the lock here would deadlock. If someone else has it
@@ -6969,22 +7178,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (!mutex_trylock(&head->mutex))
goto out;
- /*
- * at this point we have a head with no other entries. Go
- * ahead and process it.
- */
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
- atomic_dec(&delayed_refs->num_entries);
-
- /*
- * we don't take a ref on the node because we're removing it from the
- * tree, so we just steal the ref the tree was holding.
- */
- delayed_refs->num_heads--;
- if (head->processing == 0)
- delayed_refs->num_heads_ready--;
+ btrfs_delete_ref_head(delayed_refs, head);
head->processing = 0;
+
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
@@ -6992,6 +7188,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (head->must_insert_reserved)
ret = 1;
+ cleanup_ref_head_accounting(trans, head);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
return ret;
@@ -7239,6 +7436,345 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
}
/*
+ * Structure used internally for find_free_extent() function. Wraps needed
+ * parameters.
+ */
+struct find_free_extent_ctl {
+ /* Basic allocation info */
+ u64 ram_bytes;
+ u64 num_bytes;
+ u64 empty_size;
+ u64 flags;
+ int delalloc;
+
+ /* Where to start the search inside the bg */
+ u64 search_start;
+
+ /* For clustered allocation */
+ u64 empty_cluster;
+
+ bool have_caching_bg;
+ bool orig_have_caching_bg;
+
+ /* RAID index, converted from flags */
+ int index;
+
+ /*
+ * Current loop number, check find_free_extent_update_loop() for details
+ */
+ int loop;
+
+ /*
+ * Whether we're refilling a cluster, if true we need to re-search
+ * current block group but don't try to refill the cluster again.
+ */
+ bool retry_clustered;
+
+ /*
+ * Whether we're updating free space cache, if true we need to re-search
+ * current block group but don't try updating free space cache again.
+ */
+ bool retry_unclustered;
+
+ /* If current block group is cached */
+ int cached;
+
+ /* Max contiguous hole found */
+ u64 max_extent_size;
+
+ /* Total free space from free space cache, not always contiguous */
+ u64 total_free_space;
+
+ /* Found result */
+ u64 found_offset;
+};
+
+
+/*
+ * Helper function for find_free_extent().
+ *
+ * Return -ENOENT to inform caller that we need fallback to unclustered mode.
+ * Return -EAGAIN to inform caller that we need to re-search this block group
+ * Return >0 to inform caller that we find nothing
+ * Return 0 means we have found a location and set ffe_ctl->found_offset.
+ */
+static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
+ struct btrfs_free_cluster *last_ptr,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group_cache **cluster_bg_ret)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_block_group_cache *cluster_bg;
+ u64 aligned_cluster;
+ u64 offset;
+ int ret;
+
+ cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
+ if (!cluster_bg)
+ goto refill_cluster;
+ if (cluster_bg != bg && (cluster_bg->ro ||
+ !block_group_bits(cluster_bg, ffe_ctl->flags)))
+ goto release_cluster;
+
+ offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
+ ffe_ctl->num_bytes, cluster_bg->key.objectid,
+ &ffe_ctl->max_extent_size);
+ if (offset) {
+ /* We have a block, we're done */
+ spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(cluster_bg,
+ ffe_ctl->search_start, ffe_ctl->num_bytes);
+ *cluster_bg_ret = cluster_bg;
+ ffe_ctl->found_offset = offset;
+ return 0;
+ }
+ WARN_ON(last_ptr->block_group != cluster_bg);
+
+release_cluster:
+ /*
+ * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
+ * lets just skip it and let the allocator find whatever block it can
+ * find. If we reach this point, we will have tried the cluster
+ * allocator plenty of times and not have found anything, so we are
+ * likely way too fragmented for the clustering stuff to find anything.
+ *
+ * However, if the cluster is taken from the current block group,
+ * release the cluster first, so that we stand a better chance of
+ * succeeding in the unclustered allocation.
+ */
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
+ spin_unlock(&last_ptr->refill_lock);
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+ return -ENOENT;
+ }
+
+ /* This cluster didn't work out, free it and start over */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+
+ if (cluster_bg != bg)
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+
+refill_cluster:
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ return -ENOENT;
+ }
+
+ aligned_cluster = max_t(u64,
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size,
+ bg->full_stripe_len);
+ ret = btrfs_find_space_cluster(fs_info, bg, last_ptr,
+ ffe_ctl->search_start, ffe_ctl->num_bytes,
+ aligned_cluster);
+ if (ret == 0) {
+ /* Now pull our allocation out of this cluster */
+ offset = btrfs_alloc_from_cluster(bg, last_ptr,
+ ffe_ctl->num_bytes, ffe_ctl->search_start,
+ &ffe_ctl->max_extent_size);
+ if (offset) {
+ /* We found one, proceed */
+ spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(bg,
+ ffe_ctl->search_start,
+ ffe_ctl->num_bytes);
+ ffe_ctl->found_offset = offset;
+ return 0;
+ }
+ } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
+ !ffe_ctl->retry_clustered) {
+ spin_unlock(&last_ptr->refill_lock);
+
+ ffe_ctl->retry_clustered = true;
+ wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size);
+ return -EAGAIN;
+ }
+ /*
+ * At this point we either didn't find a cluster or we weren't able to
+ * allocate a block from our cluster. Free the cluster we've been
+ * trying to use, and go to the next block group.
+ */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ spin_unlock(&last_ptr->refill_lock);
+ return 1;
+}
+
+/*
+ * Return >0 to inform caller that we find nothing
+ * Return 0 when we found an free extent and set ffe_ctrl->found_offset
+ * Return -EAGAIN to inform caller that we need to re-search this block group
+ */
+static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
+ struct btrfs_free_cluster *last_ptr,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ u64 offset;
+
+ /*
+ * We are doing an unclustered allocation, set the fragmented flag so
+ * we don't bother trying to setup a cluster again until we get more
+ * space.
+ */
+ if (unlikely(last_ptr)) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->fragmented = 1;
+ spin_unlock(&last_ptr->lock);
+ }
+ if (ffe_ctl->cached) {
+ struct btrfs_free_space_ctl *free_space_ctl;
+
+ free_space_ctl = bg->free_space_ctl;
+ spin_lock(&free_space_ctl->tree_lock);
+ if (free_space_ctl->free_space <
+ ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
+ ffe_ctl->empty_size) {
+ ffe_ctl->total_free_space = max_t(u64,
+ ffe_ctl->total_free_space,
+ free_space_ctl->free_space);
+ spin_unlock(&free_space_ctl->tree_lock);
+ return 1;
+ }
+ spin_unlock(&free_space_ctl->tree_lock);
+ }
+
+ offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
+ ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ &ffe_ctl->max_extent_size);
+
+ /*
+ * If we didn't find a chunk, and we haven't failed on this block group
+ * before, and this block group is in the middle of caching and we are
+ * ok with waiting, then go ahead and wait for progress to be made, and
+ * set @retry_unclustered to true.
+ *
+ * If @retry_unclustered is true then we've already waited on this
+ * block group once and should move on to the next block group.
+ */
+ if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
+ ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
+ wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+ ffe_ctl->empty_size);
+ ffe_ctl->retry_unclustered = true;
+ return -EAGAIN;
+ } else if (!offset) {
+ return 1;
+ }
+ ffe_ctl->found_offset = offset;
+ return 0;
+}
+
+/*
+ * Return >0 means caller needs to re-search for free extent
+ * Return 0 means we have the needed free extent.
+ * Return <0 means we failed to locate any free extent.
+ */
+static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
+ struct btrfs_free_cluster *last_ptr,
+ struct btrfs_key *ins,
+ struct find_free_extent_ctl *ffe_ctl,
+ int full_search, bool use_cluster)
+{
+ struct btrfs_root *root = fs_info->extent_root;
+ int ret;
+
+ if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
+ ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
+ ffe_ctl->orig_have_caching_bg = true;
+
+ if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
+ ffe_ctl->have_caching_bg)
+ return 1;
+
+ if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
+ return 1;
+
+ if (ins->objectid) {
+ if (!use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
+ return 0;
+ }
+
+ /*
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+ * caching kthreads as we move along
+ * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+ * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+ * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+ * again
+ */
+ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
+ ffe_ctl->index = 0;
+ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
+ /*
+ * We want to skip the LOOP_CACHING_WAIT step if we
+ * don't have any uncached bgs and we've already done a
+ * full search through.
+ */
+ if (ffe_ctl->orig_have_caching_bg || !full_search)
+ ffe_ctl->loop = LOOP_CACHING_WAIT;
+ else
+ ffe_ctl->loop = LOOP_ALLOC_CHUNK;
+ } else {
+ ffe_ctl->loop++;
+ }
+
+ if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
+ struct btrfs_trans_handle *trans;
+ int exist = 0;
+
+ trans = current->journal_info;
+ if (trans)
+ exist = 1;
+ else
+ trans = btrfs_join_transaction(root);
+
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ return ret;
+ }
+
+ ret = do_chunk_alloc(trans, ffe_ctl->flags,
+ CHUNK_ALLOC_FORCE);
+
+ /*
+ * If we can't allocate a new chunk we've already looped
+ * through at least once, move on to the NO_EMPTY_SIZE
+ * case.
+ */
+ if (ret == -ENOSPC)
+ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
+
+ /* Do not bail out on ENOSPC since we can do more. */
+ if (ret < 0 && ret != -ENOSPC)
+ btrfs_abort_transaction(trans, ret);
+ else
+ ret = 0;
+ if (!exist)
+ btrfs_end_transaction(trans);
+ if (ret)
+ return ret;
+ }
+
+ if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
+ /*
+ * Don't loop again if we already have no empty_size and
+ * no empty_cluster.
+ */
+ if (ffe_ctl->empty_size == 0 &&
+ ffe_ctl->empty_cluster == 0)
+ return -ENOSPC;
+ ffe_ctl->empty_size = 0;
+ ffe_ctl->empty_cluster = 0;
+ }
+ return 1;
+ }
+ return -ENOSPC;
+}
+
+/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
* ins->objectid == start position
@@ -7248,6 +7784,20 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
*
* If there is no suitable free space, we will record the max size of
* the free space extent currently.
+ *
+ * The overall logic and call chain:
+ *
+ * find_free_extent()
+ * |- Iterate through all block groups
+ * | |- Get a valid block group
+ * | |- Try to do clustered allocation in that block group
+ * | |- Try to do unclustered allocation in that block group
+ * | |- Check if the result is valid
+ * | | |- If valid, then exit
+ * | |- Jump to next block group
+ * |
+ * |- Push harder to find free extents
+ * |- If not found, re-iterate all block groups
*/
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
@@ -7255,24 +7805,28 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 flags, int delalloc)
{
int ret = 0;
- struct btrfs_root *root = fs_info->extent_root;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
- u64 search_start = 0;
- u64 max_extent_size = 0;
- u64 max_free_space = 0;
- u64 empty_cluster = 0;
+ struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
- int loop = 0;
- int index = btrfs_bg_flags_to_raid_index(flags);
- bool failed_cluster_refill = false;
- bool failed_alloc = false;
bool use_cluster = true;
- bool have_caching_bg = false;
- bool orig_have_caching_bg = false;
bool full_search = false;
WARN_ON(num_bytes < fs_info->sectorsize);
+
+ ffe_ctl.ram_bytes = ram_bytes;
+ ffe_ctl.num_bytes = num_bytes;
+ ffe_ctl.empty_size = empty_size;
+ ffe_ctl.flags = flags;
+ ffe_ctl.search_start = 0;
+ ffe_ctl.retry_clustered = false;
+ ffe_ctl.retry_unclustered = false;
+ ffe_ctl.delalloc = delalloc;
+ ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
+ ffe_ctl.have_caching_bg = false;
+ ffe_ctl.orig_have_caching_bg = false;
+ ffe_ctl.found_offset = 0;
+
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
@@ -7308,7 +7862,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
}
- last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
+ last_ptr = fetch_cluster_info(fs_info, space_info,
+ &ffe_ctl.empty_cluster);
if (last_ptr) {
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
@@ -7325,10 +7880,12 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
spin_unlock(&last_ptr->lock);
}
- search_start = max(search_start, first_logical_byte(fs_info, 0));
- search_start = max(search_start, hint_byte);
- if (search_start == hint_byte) {
- block_group = btrfs_lookup_block_group(fs_info, search_start);
+ ffe_ctl.search_start = max(ffe_ctl.search_start,
+ first_logical_byte(fs_info, 0));
+ ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
+ if (ffe_ctl.search_start == hint_byte) {
+ block_group = btrfs_lookup_block_group(fs_info,
+ ffe_ctl.search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -7350,7 +7907,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- index = btrfs_bg_flags_to_raid_index(
+ ffe_ctl.index = btrfs_bg_flags_to_raid_index(
block_group->flags);
btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
@@ -7360,21 +7917,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
}
}
search:
- have_caching_bg = false;
- if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
+ ffe_ctl.have_caching_bg = false;
+ if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
+ ffe_ctl.index == 0)
full_search = true;
down_read(&space_info->groups_sem);
- list_for_each_entry(block_group, &space_info->block_groups[index],
- list) {
- u64 offset;
- int cached;
-
+ list_for_each_entry(block_group,
+ &space_info->block_groups[ffe_ctl.index], list) {
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro))
continue;
btrfs_grab_block_group(block_group, delalloc);
- search_start = block_group->key.objectid;
+ ffe_ctl.search_start = block_group->key.objectid;
/*
* this can happen if we end up cycling through all the
@@ -7398,9 +7953,9 @@ search:
}
have_block_group:
- cached = block_group_cache_done(block_group);
- if (unlikely(!cached)) {
- have_caching_bg = true;
+ ffe_ctl.cached = block_group_cache_done(block_group);
+ if (unlikely(!ffe_ctl.cached)) {
+ ffe_ctl.have_caching_bg = true;
ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
@@ -7414,322 +7969,92 @@ have_block_group:
* lets look there
*/
if (last_ptr && use_cluster) {
- struct btrfs_block_group_cache *used_block_group;
- unsigned long aligned_cluster;
- /*
- * the refill lock keeps out other
- * people trying to start a new cluster
- */
- used_block_group = btrfs_lock_cluster(block_group,
- last_ptr,
- delalloc);
- if (!used_block_group)
- goto refill_cluster;
-
- if (used_block_group != block_group &&
- (used_block_group->ro ||
- !block_group_bits(used_block_group, flags)))
- goto release_cluster;
-
- offset = btrfs_alloc_from_cluster(used_block_group,
- last_ptr,
- num_bytes,
- used_block_group->key.objectid,
- &max_extent_size);
- if (offset) {
- /* we have a block, we're done */
- spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(
- used_block_group,
- search_start, num_bytes);
- if (used_block_group != block_group) {
- btrfs_release_block_group(block_group,
- delalloc);
- block_group = used_block_group;
- }
- goto checks;
- }
-
- WARN_ON(last_ptr->block_group != used_block_group);
-release_cluster:
- /* If we are on LOOP_NO_EMPTY_SIZE, we can't
- * set up a new clusters, so lets just skip it
- * and let the allocator find whatever block
- * it can find. If we reach this point, we
- * will have tried the cluster allocator
- * plenty of times and not have found
- * anything, so we are likely way too
- * fragmented for the clustering stuff to find
- * anything.
- *
- * However, if the cluster is taken from the
- * current block group, release the cluster
- * first, so that we stand a better chance of
- * succeeding in the unclustered
- * allocation. */
- if (loop >= LOOP_NO_EMPTY_SIZE &&
- used_block_group != block_group) {
- spin_unlock(&last_ptr->refill_lock);
- btrfs_release_block_group(used_block_group,
- delalloc);
- goto unclustered_alloc;
- }
+ struct btrfs_block_group_cache *cluster_bg = NULL;
- /*
- * this cluster didn't work out, free it and
- * start over
- */
- btrfs_return_cluster_to_free_space(NULL, last_ptr);
-
- if (used_block_group != block_group)
- btrfs_release_block_group(used_block_group,
- delalloc);
-refill_cluster:
- if (loop >= LOOP_NO_EMPTY_SIZE) {
- spin_unlock(&last_ptr->refill_lock);
- goto unclustered_alloc;
- }
-
- aligned_cluster = max_t(unsigned long,
- empty_cluster + empty_size,
- block_group->full_stripe_len);
+ ret = find_free_extent_clustered(block_group, last_ptr,
+ &ffe_ctl, &cluster_bg);
- /* allocate a cluster in this block group */
- ret = btrfs_find_space_cluster(fs_info, block_group,
- last_ptr, search_start,
- num_bytes,
- aligned_cluster);
if (ret == 0) {
- /*
- * now pull our allocation out of this
- * cluster
- */
- offset = btrfs_alloc_from_cluster(block_group,
- last_ptr,
- num_bytes,
- search_start,
- &max_extent_size);
- if (offset) {
- /* we found one, proceed */
- spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(
- block_group, search_start,
- num_bytes);
- goto checks;
+ if (cluster_bg && cluster_bg != block_group) {
+ btrfs_release_block_group(block_group,
+ delalloc);
+ block_group = cluster_bg;
}
- } else if (!cached && loop > LOOP_CACHING_NOWAIT
- && !failed_cluster_refill) {
- spin_unlock(&last_ptr->refill_lock);
-
- failed_cluster_refill = true;
- wait_block_group_cache_progress(block_group,
- num_bytes + empty_cluster + empty_size);
+ goto checks;
+ } else if (ret == -EAGAIN) {
goto have_block_group;
- }
-
- /*
- * at this point we either didn't find a cluster
- * or we weren't able to allocate a block from our
- * cluster. Free the cluster we've been trying
- * to use, and go to the next block group
- */
- btrfs_return_cluster_to_free_space(NULL, last_ptr);
- spin_unlock(&last_ptr->refill_lock);
- goto loop;
- }
-
-unclustered_alloc:
- /*
- * We are doing an unclustered alloc, set the fragmented flag so
- * we don't bother trying to setup a cluster again until we get
- * more space.
- */
- if (unlikely(last_ptr)) {
- spin_lock(&last_ptr->lock);
- last_ptr->fragmented = 1;
- spin_unlock(&last_ptr->lock);
- }
- if (cached) {
- struct btrfs_free_space_ctl *ctl =
- block_group->free_space_ctl;
-
- spin_lock(&ctl->tree_lock);
- if (ctl->free_space <
- num_bytes + empty_cluster + empty_size) {
- max_free_space = max(max_free_space,
- ctl->free_space);
- spin_unlock(&ctl->tree_lock);
+ } else if (ret > 0) {
goto loop;
}
- spin_unlock(&ctl->tree_lock);
+ /* ret == -ENOENT case falls through */
}
- offset = btrfs_find_space_for_alloc(block_group, search_start,
- num_bytes, empty_size,
- &max_extent_size);
- /*
- * If we didn't find a chunk, and we haven't failed on this
- * block group before, and this block group is in the middle of
- * caching and we are ok with waiting, then go ahead and wait
- * for progress to be made, and set failed_alloc to true.
- *
- * If failed_alloc is true then we've already waited on this
- * block group once and should move on to the next block group.
- */
- if (!offset && !failed_alloc && !cached &&
- loop > LOOP_CACHING_NOWAIT) {
- wait_block_group_cache_progress(block_group,
- num_bytes + empty_size);
- failed_alloc = true;
+ ret = find_free_extent_unclustered(block_group, last_ptr,
+ &ffe_ctl);
+ if (ret == -EAGAIN)
goto have_block_group;
- } else if (!offset) {
+ else if (ret > 0)
goto loop;
- }
+ /* ret == 0 case falls through */
checks:
- search_start = round_up(offset, fs_info->stripesize);
+ ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
+ fs_info->stripesize);
/* move on to the next group */
- if (search_start + num_bytes >
+ if (ffe_ctl.search_start + num_bytes >
block_group->key.objectid + block_group->key.offset) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+ num_bytes);
goto loop;
}
- if (offset < search_start)
- btrfs_add_free_space(block_group, offset,
- search_start - offset);
+ if (ffe_ctl.found_offset < ffe_ctl.search_start)
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+ ffe_ctl.search_start - ffe_ctl.found_offset);
ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
num_bytes, delalloc);
if (ret == -EAGAIN) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+ num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
/* we are all good, lets return */
- ins->objectid = search_start;
+ ins->objectid = ffe_ctl.search_start;
ins->offset = num_bytes;
- trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
+ trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
+ num_bytes);
btrfs_release_block_group(block_group, delalloc);
break;
loop:
- failed_cluster_refill = false;
- failed_alloc = false;
+ ffe_ctl.retry_clustered = false;
+ ffe_ctl.retry_unclustered = false;
BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
- index);
+ ffe_ctl.index);
btrfs_release_block_group(block_group, delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
- if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
- && !orig_have_caching_bg)
- orig_have_caching_bg = true;
-
- if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
- goto search;
-
- if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+ ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
+ full_search, use_cluster);
+ if (ret > 0)
goto search;
- /*
- * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
- * caching kthreads as we move along
- * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
- * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
- * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
- * again
- */
- if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
- index = 0;
- if (loop == LOOP_CACHING_NOWAIT) {
- /*
- * We want to skip the LOOP_CACHING_WAIT step if we
- * don't have any uncached bgs and we've already done a
- * full search through.
- */
- if (orig_have_caching_bg || !full_search)
- loop = LOOP_CACHING_WAIT;
- else
- loop = LOOP_ALLOC_CHUNK;
- } else {
- loop++;
- }
-
- if (loop == LOOP_ALLOC_CHUNK) {
- struct btrfs_trans_handle *trans;
- int exist = 0;
-
- trans = current->journal_info;
- if (trans)
- exist = 1;
- else
- trans = btrfs_join_transaction(root);
-
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
-
- ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
-
- /*
- * If we can't allocate a new chunk we've already looped
- * through at least once, move on to the NO_EMPTY_SIZE
- * case.
- */
- if (ret == -ENOSPC)
- loop = LOOP_NO_EMPTY_SIZE;
-
- /*
- * Do not bail out on ENOSPC since we
- * can do more things.
- */
- if (ret < 0 && ret != -ENOSPC)
- btrfs_abort_transaction(trans, ret);
- else
- ret = 0;
- if (!exist)
- btrfs_end_transaction(trans);
- if (ret)
- goto out;
- }
-
- if (loop == LOOP_NO_EMPTY_SIZE) {
- /*
- * Don't loop again if we already have no empty_size and
- * no empty_cluster.
- */
- if (empty_size == 0 &&
- empty_cluster == 0) {
- ret = -ENOSPC;
- goto out;
- }
- empty_size = 0;
- empty_cluster = 0;
- }
-
- goto search;
- } else if (!ins->objectid) {
- ret = -ENOSPC;
- } else if (ins->objectid) {
- if (!use_cluster && last_ptr) {
- spin_lock(&last_ptr->lock);
- last_ptr->window_start = ins->objectid;
- spin_unlock(&last_ptr->lock);
- }
- ret = 0;
- }
-out:
if (ret == -ENOSPC) {
- if (!max_extent_size)
- max_extent_size = max_free_space;
+ /*
+ * Use ffe_ctl->total_free_space as fallback if we can't find
+ * any contiguous hole.
+ */
+ if (!ffe_ctl.max_extent_size)
+ ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
spin_lock(&space_info->lock);
- space_info->max_extent_size = max_extent_size;
+ space_info->max_extent_size = ffe_ctl.max_extent_size;
spin_unlock(&space_info->lock);
- ins->offset = max_extent_size;
+ ins->offset = ffe_ctl.max_extent_size;
}
return ret;
}
@@ -8169,13 +8494,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_generation(buf, trans->transid);
btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(buf, owner);
- write_extent_buffer_fsid(buf, fs_info->fsid);
+ write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
buf->log_index = root->log_transid % 2;
/*
* we allow two log transactions at a time, use different
- * EXENT bit to differentiate dirty pages.
+ * EXTENT bit to differentiate dirty pages.
*/
if (buf->log_index == 0)
set_extent_dirty(&root->dirty_log_pages, buf->start,
@@ -8221,7 +8546,12 @@ again:
goto again;
}
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ /*
+ * The global reserve still exists to save us from ourselves, so don't
+ * warn_on if we are short on our delayed refs reserve.
+ */
+ if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
+ btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL * 10,
/*DEFAULT_RATELIMIT_BURST*/ 1);
@@ -8544,7 +8874,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 bytenr;
u64 generation;
u64 parent;
- u32 blocksize;
struct btrfs_key key;
struct btrfs_key first_key;
struct extent_buffer *next;
@@ -8569,7 +8898,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
btrfs_node_key_to_cpu(path->nodes[level], &first_key,
path->slots[level]);
- blocksize = fs_info->nodesize;
next = find_extent_buffer(fs_info, bytenr);
if (!next) {
@@ -8693,7 +9021,7 @@ skip:
ret);
}
}
- ret = btrfs_free_extent(trans, root, bytenr, blocksize,
+ ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,
parent, root->root_key.objectid,
level - 1, 0);
if (ret)
@@ -8944,9 +9272,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_free;
}
+ err = btrfs_run_delayed_items(trans);
+ if (err)
+ goto out_end_trans;
+
if (block_rsv)
trans->block_rsv = block_rsv;
+ /*
+ * This will help us catch people modifying the fs tree while we're
+ * dropping it. It is unsafe to mess with the fs tree while it's being
+ * dropped as we unlock the root node and parent nodes as we walk down
+ * the tree, assuming nothing will change. If something does change
+ * then we'll have stale information and drop references to blocks we've
+ * already dropped.
+ */
+ set_bit(BTRFS_ROOT_DELETING, &root->state);
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
@@ -9421,7 +9762,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
}
/*
- * checks to see if its even possible to relocate this block group.
+ * Checks to see if it's even possible to relocate this block group.
*
* @return - -1 if it's not a good idea to relocate this block group, 0 if its
* ok to go ahead and try.
@@ -10049,7 +10390,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
* check for two cases, either we are full, and therefore
* don't need to bother with the caching work since we won't
* find any space, or we are empty, and we can just add all
- * the space in and be done with it. This saves us _alot_ of
+ * the space in and be done with it. This saves us _a_lot_ of
* time, particularly in the full case.
*/
if (found_key.offset == btrfs_block_group_used(&cache->item)) {
@@ -10154,6 +10495,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
add_block_group_free_space(trans, block_group);
/* already aborted the transaction if it failed. */
next:
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
list_del_init(&block_group->bg_list);
}
btrfs_trans_release_chunk_metadata(trans);
@@ -10231,6 +10573,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
link_block_group(cache);
list_add_tail(&cache->bg_list, &trans->new_bgs);
+ trans->delayed_ref_updates++;
+ btrfs_update_delayed_refs_rsv(trans);
set_avail_alloc_bits(fs_info, type);
return 0;
@@ -10268,6 +10612,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int factor;
struct btrfs_caching_control *caching_ctl = NULL;
bool remove_em;
+ bool remove_rsv = false;
block_group = btrfs_lookup_block_group(fs_info, group_start);
BUG_ON(!block_group);
@@ -10315,7 +10660,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
mutex_lock(&trans->transaction->cache_write_mutex);
/*
- * make sure our free spache cache IO is done before remove the
+ * Make sure our free space cache IO is done before removing the
* free space inode
*/
spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -10332,6 +10677,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (!list_empty(&block_group->dirty_list)) {
list_del_init(&block_group->dirty_list);
+ remove_rsv = true;
btrfs_put_block_group(block_group);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
@@ -10541,6 +10887,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
out:
+ if (remove_rsv)
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
btrfs_free_path(path);
return ret;
}
@@ -10698,7 +11046,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- space_info->bytes_pinned -= block_group->pinned;
+ update_bytes_pinned(space_info, -block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
percpu_counter_add_batch(&space_info->total_bytes_pinned,
-block_group->pinned,
@@ -10829,7 +11177,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
if (!blk_queue_discard(bdev_get_queue(device->bdev)))
return 0;
- /* Not writeable = nothing to do. */
+ /* Not writable = nothing to do. */
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d228f70..52abe40 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -89,9 +89,18 @@ void btrfs_leak_debug_check(void)
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree, u64 start, u64 end)
{
- if (tree->ops && tree->ops->check_extent_io_range)
- tree->ops->check_extent_io_range(tree->private_data, caller,
- start, end);
+ struct inode *inode = tree->private_data;
+ u64 isize;
+
+ if (!inode || !is_data_inode(inode))
+ return;
+
+ isize = i_size_read(inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
+ caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+ }
}
#else
#define btrfs_leak_debug_add(new, head) do {} while (0)
@@ -344,13 +353,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
- struct extent_state *other)
-{
- if (tree->ops && tree->ops->merge_extent_hook)
- tree->ops->merge_extent_hook(tree->private_data, new, other);
-}
-
/*
* utility function to look for merge candidates inside a given range.
* Any extents with matching state are merged together into a single
@@ -374,7 +376,10 @@ static void merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->end == state->start - 1 &&
other->state == state->state) {
- merge_cb(tree, state, other);
+ if (tree->private_data &&
+ is_data_inode(tree->private_data))
+ btrfs_merge_delalloc_extent(tree->private_data,
+ state, other);
state->start = other->start;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
@@ -386,7 +391,10 @@ static void merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->start == state->end + 1 &&
other->state == state->state) {
- merge_cb(tree, state, other);
+ if (tree->private_data &&
+ is_data_inode(tree->private_data))
+ btrfs_merge_delalloc_extent(tree->private_data,
+ state, other);
state->end = other->end;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
@@ -395,20 +403,6 @@ static void merge_state(struct extent_io_tree *tree,
}
}
-static void set_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits)
-{
- if (tree->ops && tree->ops->set_bit_hook)
- tree->ops->set_bit_hook(tree->private_data, state, bits);
-}
-
-static void clear_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits)
-{
- if (tree->ops && tree->ops->clear_bit_hook)
- tree->ops->clear_bit_hook(tree->private_data, state, bits);
-}
-
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state, unsigned *bits,
struct extent_changeset *changeset);
@@ -451,13 +445,6 @@ static int insert_state(struct extent_io_tree *tree,
return 0;
}
-static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
- u64 split)
-{
- if (tree->ops && tree->ops->split_extent_hook)
- tree->ops->split_extent_hook(tree->private_data, orig, split);
-}
-
/*
* split a given extent state struct in two, inserting the preallocated
* struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -477,7 +464,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
{
struct rb_node *node;
- split_cb(tree, orig, split);
+ if (tree->private_data && is_data_inode(tree->private_data))
+ btrfs_split_delalloc_extent(tree->private_data, orig, split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@@ -504,7 +492,7 @@ static struct extent_state *next_state(struct extent_state *state)
/*
* utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1).
+ * it will optionally wake up anyone waiting on this state (wake == 1).
*
* If no bits are set on the state struct after clearing things, the
* struct is freed and removed from the tree
@@ -523,7 +511,10 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
WARN_ON(range > tree->dirty_bytes);
tree->dirty_bytes -= range;
}
- clear_state_cb(tree, state, bits);
+
+ if (tree->private_data && is_data_inode(tree->private_data))
+ btrfs_clear_delalloc_extent(tree->private_data, state, bits);
+
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
state->state &= ~bits_to_clear;
@@ -800,7 +791,9 @@ static void set_state_bits(struct extent_io_tree *tree,
unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
int ret;
- set_state_cb(tree, state, bits);
+ if (tree->private_data && is_data_inode(tree->private_data))
+ btrfs_set_delalloc_extent(tree->private_data, state, bits);
+
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
@@ -1459,16 +1452,16 @@ out:
* find a contiguous range of bytes in the file marked as delalloc, not
* more than 'max_bytes'. start and end are used to return the range,
*
- * 1 is returned if we find something, 0 if nothing was in the tree
+ * true is returned if we find something, false if nothing was in the tree
*/
-static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+static noinline bool find_delalloc_range(struct extent_io_tree *tree,
u64 *start, u64 *end, u64 max_bytes,
struct extent_state **cached_state)
{
struct rb_node *node;
struct extent_state *state;
u64 cur_start = *start;
- u64 found = 0;
+ bool found = false;
u64 total_bytes = 0;
spin_lock(&tree->lock);
@@ -1479,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
*/
node = tree_search(tree, cur_start);
if (!node) {
- if (!found)
- *end = (u64)-1;
+ *end = (u64)-1;
goto out;
}
@@ -1500,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
*cached_state = state;
refcount_inc(&state->refs);
}
- found++;
+ found = true;
*end = state->end;
cur_start = state->end + 1;
node = rb_next(node);
@@ -1558,19 +1550,22 @@ static noinline int lock_delalloc_pages(struct inode *inode,
}
/*
- * find a contiguous range of bytes in the file marked as delalloc, not
- * more than 'max_bytes'. start and end are used to return the range,
+ * Find and lock a contiguous range of bytes in the file marked as delalloc, no
+ * more than @max_bytes. @Start and @end are used to return the range,
*
- * 1 is returned if we find something, 0 if nothing was in the tree
+ * Return: true if we find something
+ * false if nothing was in the tree
*/
-static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
+EXPORT_FOR_TESTS
+noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
- u64 *end, u64 max_bytes)
+ u64 *end)
{
+ u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
- u64 found;
+ bool found;
struct extent_state *cached_state = NULL;
int ret;
int loops = 0;
@@ -1585,7 +1580,7 @@ again:
*start = delalloc_start;
*end = delalloc_end;
free_extent_state(cached_state);
- return 0;
+ return false;
}
/*
@@ -1605,6 +1600,7 @@ again:
/* step two, lock all the pages after the page that has start */
ret = lock_delalloc_pages(inode, locked_page,
delalloc_start, delalloc_end);
+ ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
/* some of the pages are gone, lets avoid looping by
* shortening the size of the delalloc range we're searching
@@ -1616,11 +1612,10 @@ again:
loops = 1;
goto again;
} else {
- found = 0;
+ found = false;
goto out_failed;
}
}
- BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
/* step three, lock the state bits for the whole range */
lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
@@ -1643,17 +1638,6 @@ out_failed:
return found;
}
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-u64 btrfs_find_lock_delalloc_range(struct inode *inode,
- struct extent_io_tree *tree,
- struct page *locked_page, u64 *start,
- u64 *end, u64 max_bytes)
-{
- return find_lock_delalloc_range(inode, tree, locked_page, start, end,
- max_bytes);
-}
-#endif
-
static int __process_pages_contig(struct address_space *mapping,
struct page *locked_page,
pgoff_t start_index, pgoff_t end_index,
@@ -2349,13 +2333,11 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
}
/*
- * this is a generic handler for readpage errors (default
- * readpage_io_failed_hook). if other copies exist, read those and write back
- * good data to the failed position. does not investigate in remapping the
- * failed extent elsewhere, hoping the device will be smart enough to do this as
- * needed
+ * This is a generic handler for readpage errors. If other copies exist, read
+ * those and write back good data to the failed position. Does not investigate
+ * in remapping the failed extent elsewhere, hoping the device will be smart
+ * enough to do this as needed
*/
-
static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
struct page *page, u64 start, u64 end,
int failed_mirror)
@@ -2412,14 +2394,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
int uptodate = (err == 0);
- struct extent_io_tree *tree;
int ret = 0;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
-
- if (tree->ops && tree->ops->writepage_end_io_hook)
- tree->ops->writepage_end_io_hook(page, start, end, NULL,
- uptodate);
+ btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
if (!uptodate) {
ClearPageUptodate(page);
@@ -2522,6 +2499,8 @@ static void end_bio_extent_readpage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ bool data_inode = btrfs_ino(BTRFS_I(inode))
+ != BTRFS_BTREE_INODE_OBJECTID;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
@@ -2551,7 +2530,7 @@ static void end_bio_extent_readpage(struct bio *bio)
len = bvec->bv_len;
mirror = io_bio->mirror_num;
- if (likely(uptodate && tree->ops)) {
+ if (likely(uptodate)) {
ret = tree->ops->readpage_end_io_hook(io_bio, offset,
page, start, end,
mirror);
@@ -2567,38 +2546,37 @@ static void end_bio_extent_readpage(struct bio *bio)
if (likely(uptodate))
goto readpage_ok;
- if (tree->ops) {
- ret = tree->ops->readpage_io_failed_hook(page, mirror);
- if (ret == -EAGAIN) {
- /*
- * Data inode's readpage_io_failed_hook() always
- * returns -EAGAIN.
- *
- * The generic bio_readpage_error handles errors
- * the following way: If possible, new read
- * requests are created and submitted and will
- * end up in end_bio_extent_readpage as well (if
- * we're lucky, not in the !uptodate case). In
- * that case it returns 0 and we just go on with
- * the next page in our bio. If it can't handle
- * the error it will return -EIO and we remain
- * responsible for that page.
- */
- ret = bio_readpage_error(bio, offset, page,
- start, end, mirror);
- if (ret == 0) {
- uptodate = !bio->bi_status;
- offset += len;
- continue;
- }
- }
+ if (data_inode) {
/*
- * metadata's readpage_io_failed_hook() always returns
- * -EIO and fixes nothing. -EIO is also returned if
- * data inode error could not be fixed.
+ * The generic bio_readpage_error handles errors the
+ * following way: If possible, new read requests are
+ * created and submitted and will end up in
+ * end_bio_extent_readpage as well (if we're lucky,
+ * not in the !uptodate case). In that case it returns
+ * 0 and we just go on with the next page in our bio.
+ * If it can't handle the error it will return -EIO and
+ * we remain responsible for that page.
*/
- ASSERT(ret == -EIO);
+ ret = bio_readpage_error(bio, offset, page, start, end,
+ mirror);
+ if (ret == 0) {
+ uptodate = !bio->bi_status;
+ offset += len;
+ continue;
+ }
+ } else {
+ struct extent_buffer *eb;
+
+ eb = (struct extent_buffer *)page->private;
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = mirror;
+ atomic_dec(&eb->io_pages);
+ if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
+ &eb->bflags))
+ btree_readahead_hook(eb, -EIO);
+
+ ret = -EIO;
}
readpage_ok:
if (likely(uptodate)) {
@@ -2607,7 +2585,7 @@ readpage_ok:
unsigned off;
/* Zero out the end if this page straddles i_size */
- off = i_size & (PAGE_SIZE-1);
+ off = offset_in_page(i_size);
if (page->index == end_index && off)
zero_user_segment(page, off, PAGE_SIZE);
SetPageUptodate(page);
@@ -2644,8 +2622,7 @@ readpage_ok:
if (extent_len)
endio_readpage_release_extent(tree, extent_start, extent_len,
uptodate);
- if (io_bio->end_io)
- io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
+ btrfs_io_bio_free_csum(io_bio);
bio_put(bio);
}
@@ -2782,8 +2759,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
else
contig = bio_end_sector(bio) == sector;
- if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size,
- bio, bio_flags))
+ ASSERT(tree->ops);
+ if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
can_merge = false;
if (prev_bio_flags != bio_flags || !contig || !can_merge ||
@@ -2911,7 +2888,7 @@ static int __do_readpage(struct extent_io_tree *tree,
if (page->index == last_byte >> PAGE_SHIFT) {
char *userpage;
- size_t zero_offset = last_byte & (PAGE_SIZE - 1);
+ size_t zero_offset = offset_in_page(last_byte);
if (zero_offset) {
iosize = PAGE_SIZE - zero_offset;
@@ -3205,7 +3182,7 @@ static void update_nr_written(struct writeback_control *wbc,
/*
* helper for __extent_writepage, doing all of the delayed allocation setup.
*
- * This returns 1 if our fill_delalloc function did all the work required
+ * This returns 1 if btrfs_run_delalloc_range function did all the work required
* to write the page (copy into inline extent). In this case the IO has
* been started and the page is already unlocked.
*
@@ -3213,44 +3190,37 @@ static void update_nr_written(struct writeback_control *wbc,
* This returns < 0 if there were errors (page still locked)
*/
static noinline_for_stack int writepage_delalloc(struct inode *inode,
- struct page *page, struct writeback_control *wbc,
- struct extent_page_data *epd,
- u64 delalloc_start,
- unsigned long *nr_written)
+ struct page *page, struct writeback_control *wbc,
+ u64 delalloc_start, unsigned long *nr_written)
{
- struct extent_io_tree *tree = epd->tree;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 page_end = delalloc_start + PAGE_SIZE - 1;
- u64 nr_delalloc;
+ bool found;
u64 delalloc_to_write = 0;
u64 delalloc_end = 0;
int ret;
int page_started = 0;
- if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
- return 0;
while (delalloc_end < page_end) {
- nr_delalloc = find_lock_delalloc_range(inode, tree,
+ found = find_lock_delalloc_range(inode, tree,
page,
&delalloc_start,
- &delalloc_end,
- BTRFS_MAX_EXTENT_SIZE);
- if (nr_delalloc == 0) {
+ &delalloc_end);
+ if (!found) {
delalloc_start = delalloc_end + 1;
continue;
}
- ret = tree->ops->fill_delalloc(inode, page,
- delalloc_start,
- delalloc_end,
- &page_started,
- nr_written, wbc);
+ ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
+ delalloc_end, &page_started, nr_written, wbc);
/* File system has been set read-only */
if (ret) {
SetPageError(page);
- /* fill_delalloc should be return < 0 for error
- * but just in case, we use > 0 here meaning the
- * IO is started, so we don't want to return > 0
- * unless things are going well.
+ /*
+ * btrfs_run_delalloc_range should return < 0 for error
+ * but just in case, we use > 0 here meaning the IO is
+ * started, so we don't want to return > 0 unless
+ * things are going well.
*/
ret = ret < 0 ? ret : -EIO;
goto done;
@@ -3323,20 +3293,17 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
int nr = 0;
bool compressed;
- if (tree->ops && tree->ops->writepage_start_hook) {
- ret = tree->ops->writepage_start_hook(page, start,
- page_end);
- if (ret) {
- /* Fixup worker will requeue */
- if (ret == -EBUSY)
- wbc->pages_skipped++;
- else
- redirty_page_for_writepage(wbc, page);
+ ret = btrfs_writepage_cow_fixup(page, start, page_end);
+ if (ret) {
+ /* Fixup worker will requeue */
+ if (ret == -EBUSY)
+ wbc->pages_skipped++;
+ else
+ redirty_page_for_writepage(wbc, page);
- update_nr_written(wbc, nr_written);
- unlock_page(page);
- return 1;
- }
+ update_nr_written(wbc, nr_written);
+ unlock_page(page);
+ return 1;
}
/*
@@ -3347,9 +3314,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
end = page_end;
if (i_size <= start) {
- if (tree->ops && tree->ops->writepage_end_io_hook)
- tree->ops->writepage_end_io_hook(page, start,
- page_end, NULL, 1);
+ btrfs_writepage_endio_finish_ordered(page, start, page_end, 1);
goto done;
}
@@ -3360,9 +3325,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
u64 offset;
if (cur >= i_size) {
- if (tree->ops && tree->ops->writepage_end_io_hook)
- tree->ops->writepage_end_io_hook(page, cur,
- page_end, NULL, 1);
+ btrfs_writepage_endio_finish_ordered(page, cur,
+ page_end, 1);
break;
}
em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
@@ -3396,11 +3360,10 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
* end_io notification does not happen here for
* compressed extents
*/
- if (!compressed && tree->ops &&
- tree->ops->writepage_end_io_hook)
- tree->ops->writepage_end_io_hook(page, cur,
- cur + iosize - 1,
- NULL, 1);
+ if (!compressed)
+ btrfs_writepage_endio_finish_ordered(page, cur,
+ cur + iosize - 1,
+ 1);
else if (compressed) {
/* we don't want to end_page_writeback on
* a compressed extent. this happens
@@ -3469,7 +3432,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
ClearPageError(page);
- pg_offset = i_size & (PAGE_SIZE - 1);
+ pg_offset = offset_in_page(i_size);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
@@ -3491,11 +3454,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
set_page_extent_mapped(page);
- ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
- if (ret == 1)
- goto done_unlocked;
- if (ret)
- goto done;
+ if (!epd->extent_locked) {
+ ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
+ if (ret == 1)
+ goto done_unlocked;
+ if (ret)
+ goto done;
+ }
ret = __extent_writepage_io(inode, page, wbc, epd,
i_size, nr_written, write_flags, &nr);
@@ -3934,12 +3899,25 @@ static int extent_write_cache_pages(struct address_space *mapping,
range_whole = 1;
scanned = 1;
}
- if (wbc->sync_mode == WB_SYNC_ALL)
+
+ /*
+ * We do the tagged writepage as long as the snapshot flush bit is set
+ * and we are the first one who do the filemap_flush() on this inode.
+ *
+ * The nr_to_write == LONG_MAX is needed to make sure other flushers do
+ * not race in and drop the bit.
+ */
+ if (range_whole && wbc->nr_to_write == LONG_MAX &&
+ test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+ &BTRFS_I(inode)->runtime_flags))
+ wbc->tagged_writepages = 1;
+
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
retry:
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && !nr_to_write_done && (index <= end) &&
@@ -4084,10 +4062,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
if (clear_page_dirty_for_io(page))
ret = __extent_writepage(page, &wbc_writepages, &epd);
else {
- if (tree->ops && tree->ops->writepage_end_io_hook)
- tree->ops->writepage_end_io_hook(page, start,
- start + PAGE_SIZE - 1,
- NULL, 1);
+ btrfs_writepage_endio_finish_ordered(page, start,
+ start + PAGE_SIZE - 1, 1);
unlock_page(page);
}
put_page(page);
@@ -4118,42 +4094,35 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages)
{
struct bio *bio = NULL;
- unsigned page_idx;
unsigned long bio_flags = 0;
struct page *pagepool[16];
- struct page *page;
struct extent_map *em_cached = NULL;
struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
int nr = 0;
u64 prev_em_start = (u64)-1;
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- page = list_entry(pages->prev, struct page, lru);
+ while (!list_empty(pages)) {
+ for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
+ struct page *page = lru_to_page(pages);
- prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping,
- page->index,
- readahead_gfp_mask(mapping))) {
- put_page(page);
- continue;
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+ if (add_to_page_cache_lru(page, mapping, page->index,
+ readahead_gfp_mask(mapping))) {
+ put_page(page);
+ continue;
+ }
+
+ pagepool[nr++] = page;
}
- pagepool[nr++] = page;
- if (nr < ARRAY_SIZE(pagepool))
- continue;
__extent_readpages(tree, pagepool, nr, &em_cached, &bio,
- &bio_flags, &prev_em_start);
- nr = 0;
+ &bio_flags, &prev_em_start);
}
- if (nr)
- __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
- &bio_flags, &prev_em_start);
if (em_cached)
free_extent_map(em_cached);
- BUG_ON(!list_empty(pages));
if (bio)
return submit_one_bio(bio, 0, bio_flags);
return 0;
@@ -4342,7 +4311,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
/*
* Sanity check, extent_fiemap() should have ensured that new
- * fiemap extent won't overlap with cahced one.
+ * fiemap extent won't overlap with cached one.
* Not recoverable.
*
* NOTE: Physical address can overlap, due to compression
@@ -4914,13 +4883,6 @@ again:
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
- /*
- * We will free dummy extent buffer's if they come into
- * free_extent_buffer with a ref count of 2, but if we are using this we
- * want the buffers to stay in memory until we're done with them, so
- * bump the ref count again.
- */
- atomic_inc(&eb->refs);
return eb;
free_eb:
btrfs_release_extent_buffer(eb);
@@ -5102,7 +5064,9 @@ void free_extent_buffer(struct extent_buffer *eb)
while (1) {
refs = atomic_read(&eb->refs);
- if (refs <= 3)
+ if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
+ || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
+ refs == 1))
break;
old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
if (old == refs)
@@ -5111,10 +5075,6 @@ void free_extent_buffer(struct extent_buffer *eb)
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) == 2 &&
- test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))
- atomic_dec(&eb->refs);
-
- if (atomic_read(&eb->refs) == 2 &&
test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
!extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5340,7 +5300,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
if (start + len > eb->len) {
@@ -5350,7 +5310,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
return;
}
- offset = (start_offset + start) & (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + start);
while (len > 0) {
page = eb->pages[i];
@@ -5375,14 +5335,14 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb,
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + start);
while (len > 0) {
page = eb->pages[i];
@@ -5413,10 +5373,10 @@ int map_private_extent_buffer(const struct extent_buffer *eb,
char **map, unsigned long *map_start,
unsigned long *map_len)
{
- size_t offset = start & (PAGE_SIZE - 1);
+ size_t offset;
char *kaddr;
struct page *p;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
unsigned long end_i = (start_offset + start + min_len - 1) >>
PAGE_SHIFT;
@@ -5453,14 +5413,14 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + start);
while (len > 0) {
page = eb->pages[i];
@@ -5509,13 +5469,13 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
struct page *page;
char *kaddr;
char *src = (char *)srcv;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + start);
while (len > 0) {
page = eb->pages[i];
@@ -5539,13 +5499,13 @@ void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + start);
while (len > 0) {
page = eb->pages[i];
@@ -5584,13 +5544,12 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(dst->start);
unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
WARN_ON(src->len != dst_len);
- offset = (start_offset + dst_offset) &
- (PAGE_SIZE - 1);
+ offset = offset_in_page(start_offset + dst_offset);
while (len > 0) {
page = dst->pages[i];
@@ -5626,7 +5585,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,
unsigned long *page_index,
size_t *page_offset)
{
- size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(eb->start);
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
@@ -5638,7 +5597,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,
offset = start_offset + start + byte_offset;
*page_index = offset >> PAGE_SHIFT;
- *page_offset = offset & (PAGE_SIZE - 1);
+ *page_offset = offset_in_page(offset);
}
/**
@@ -5780,7 +5739,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
- size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(dst->start);
unsigned long dst_i;
unsigned long src_i;
@@ -5798,10 +5757,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
}
while (len > 0) {
- dst_off_in_page = (start_offset + dst_offset) &
- (PAGE_SIZE - 1);
- src_off_in_page = (start_offset + src_offset) &
- (PAGE_SIZE - 1);
+ dst_off_in_page = offset_in_page(start_offset + dst_offset);
+ src_off_in_page = offset_in_page(start_offset + src_offset);
dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
src_i = (start_offset + src_offset) >> PAGE_SHIFT;
@@ -5829,7 +5786,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
size_t src_off_in_page;
unsigned long dst_end = dst_offset + len - 1;
unsigned long src_end = src_offset + len - 1;
- size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+ size_t start_offset = offset_in_page(dst->start);
unsigned long dst_i;
unsigned long src_i;
@@ -5853,10 +5810,8 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
src_i = (start_offset + src_end) >> PAGE_SHIFT;
- dst_off_in_page = (start_offset + dst_end) &
- (PAGE_SIZE - 1);
- src_off_in_page = (start_offset + src_end) &
- (PAGE_SIZE - 1);
+ dst_off_in_page = offset_in_page(start_offset + dst_end);
+ src_off_in_page = offset_in_page(start_offset + src_end);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 369daa5..9673be3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -37,18 +37,22 @@
#define EXTENT_BIO_COMPRESSED 1
#define EXTENT_BIO_FLAG_SHIFT 16
-/* these are bit numbers for test/set bit */
-#define EXTENT_BUFFER_UPTODATE 0
-#define EXTENT_BUFFER_DIRTY 2
-#define EXTENT_BUFFER_CORRUPT 3
-#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
-#define EXTENT_BUFFER_TREE_REF 5
-#define EXTENT_BUFFER_STALE 6
-#define EXTENT_BUFFER_WRITEBACK 7
-#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */
-#define EXTENT_BUFFER_UNMAPPED 9
-#define EXTENT_BUFFER_IN_TREE 10
-#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */
+enum {
+ EXTENT_BUFFER_UPTODATE,
+ EXTENT_BUFFER_DIRTY,
+ EXTENT_BUFFER_CORRUPT,
+ /* this got triggered by readahead */
+ EXTENT_BUFFER_READAHEAD,
+ EXTENT_BUFFER_TREE_REF,
+ EXTENT_BUFFER_STALE,
+ EXTENT_BUFFER_WRITEBACK,
+ /* read IO error */
+ EXTENT_BUFFER_READ_ERR,
+ EXTENT_BUFFER_UNMAPPED,
+ EXTENT_BUFFER_IN_TREE,
+ /* write IO error */
+ EXTENT_BUFFER_WRITE_ERR,
+};
/* these are flags for __process_pages_contig */
#define PAGE_UNLOCK (1 << 0)
@@ -94,38 +98,13 @@ typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
struct extent_io_ops {
/*
- * The following callbacks must be allways defined, the function
+ * The following callbacks must be always defined, the function
* pointer will be called unconditionally.
*/
extent_submit_bio_hook_t *submit_bio_hook;
int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
struct page *page, u64 start, u64 end,
int mirror);
- int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
-
- /*
- * Optional hooks, called if the pointer is not NULL
- */
- int (*fill_delalloc)(void *private_data, struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- struct writeback_control *wbc);
-
- int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
- void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
- struct extent_state *state, int uptodate);
- void (*set_bit_hook)(void *private_data, struct extent_state *state,
- unsigned *bits);
- void (*clear_bit_hook)(void *private_data,
- struct extent_state *state,
- unsigned *bits);
- void (*merge_extent_hook)(void *private_data,
- struct extent_state *new,
- struct extent_state *other);
- void (*split_extent_hook)(void *private_data,
- struct extent_state *orig, u64 split);
- void (*check_extent_io_range)(void *private_data, const char *caller,
- u64 start, u64 end);
};
struct extent_io_tree {
@@ -353,11 +332,11 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end)
+ u64 end, struct extent_state **cached)
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL);
+ EXTENT_DO_ACCOUNTING, 0, 0, cached);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -546,10 +525,9 @@ int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-u64 btrfs_find_lock_delalloc_range(struct inode *inode,
- struct extent_io_tree *tree,
- struct page *locked_page, u64 *start,
- u64 *end, u64 max_bytes);
+bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7eea8b6..a042a19 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -475,7 +475,8 @@ static struct extent_map *prev_extent_map(struct extent_map *em)
return container_of(prev, struct extent_map, rb_node);
}
-/* helper for btfs_get_extent. Given an existing extent in the tree,
+/*
+ * Helper for btrfs_get_extent. Given an existing extent in the tree,
* the existing extent is the nearest extent to map_start,
* and an extent that you want to insert, deal with overlap and insert
* the best fitted new extent into the tree.
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 31977ff..ef05a01 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -11,13 +11,20 @@
#define EXTENT_MAP_INLINE ((u64)-2)
#define EXTENT_MAP_DELALLOC ((u64)-1)
-/* bits for the flags field */
-#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
-#define EXTENT_FLAG_COMPRESSED 1
-#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
-#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
-#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
-#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
+/* bits for the extent_map::flags field */
+enum {
+ /* this entry not yet on disk, don't free it */
+ EXTENT_FLAG_PINNED,
+ EXTENT_FLAG_COMPRESSED,
+ /* pre-allocated extent */
+ EXTENT_FLAG_PREALLOC,
+ /* Logging this extent */
+ EXTENT_FLAG_LOGGING,
+ /* Filling in a preallocated extent */
+ EXTENT_FLAG_FILLING,
+ /* filesystem extent mapping type */
+ EXTENT_FLAG_FS_MAPPING,
+};
struct extent_map {
struct rb_node rb_node;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index ba74827..920bf3b 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -142,11 +142,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}
-static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
-{
- kfree(bio->csum_allocated);
-}
-
static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u64 logical_offset, u32 *dst, int dio)
{
@@ -175,14 +170,12 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
if (!dst) {
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- btrfs_bio->csum_allocated = kmalloc_array(nblocks,
- csum_size, GFP_NOFS);
- if (!btrfs_bio->csum_allocated) {
+ btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
+ GFP_NOFS);
+ if (!btrfs_bio->csum) {
btrfs_free_path(path);
return BLK_STS_RESOURCE;
}
- btrfs_bio->csum = btrfs_bio->csum_allocated;
- btrfs_bio->end_io = btrfs_io_bio_endio_readpage;
} else {
btrfs_bio->csum = btrfs_bio->csum_inline;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a3c22e1..d38dc8c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -399,7 +399,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
size_t copied = 0;
size_t total_copied = 0;
int pg = 0;
- int offset = pos & (PAGE_SIZE - 1);
+ int offset = offset_in_page(pos);
while (write_bytes > 0) {
size_t count = min_t(size_t,
@@ -1611,7 +1611,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
return -ENOMEM;
while (iov_iter_count(i) > 0) {
- size_t offset = pos & (PAGE_SIZE - 1);
+ size_t offset = offset_in_page(pos);
size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_SIZE -
@@ -2005,7 +2005,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
filp->private_data = NULL;
/*
- * ordered_data_close is set by settattr when we are about to truncate
+ * ordered_data_close is set by setattr when we are about to truncate
* a file from a non-zero size to a zero size. This tries to
* flush down new bytes that may have been written if the
* application were using truncate to replace a file in place.
@@ -2089,8 +2089,32 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
+ * Before we acquired the inode's lock, someone may have dirtied more
+ * pages in the target range. We need to make sure that writeback for
+ * any such pages does not start while we are logging the inode, because
+ * if it does, any of the following might happen when we are not doing a
+ * full inode sync:
+ *
+ * 1) We log an extent after its writeback finishes but before its
+ * checksums are added to the csum tree, leading to -EIO errors
+ * when attempting to read the extent after a log replay.
+ *
+ * 2) We can end up logging an extent before its writeback finishes.
+ * Therefore after the log replay we will have a file extent item
+ * pointing to an unwritten extent (and no data checksums as well).
+ *
+ * So trigger writeback for any eventual new dirty pages and then we
+ * wait for all ordered extents to complete below.
+ */
+ ret = start_ordered_ops(inode, start, end);
+ if (ret) {
+ inode_unlock(inode);
+ goto out;
+ }
+
+ /*
* We have to do this here to avoid the priority inversion of waiting on
- * IO of a lower priority task while holding a transaciton open.
+ * IO of a lower priority task while holding a transaction open.
*/
ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
@@ -2130,7 +2154,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* here we could get into a situation where we're waiting on IO to
* happen that is blocked on a transaction trying to commit. With start
* we inc the extwriter counter, so we wait for all extwriters to exit
- * before we start blocking join'ers. This comment is to keep somebody
+ * before we start blocking joiners. This comment is to keep somebody
* from thinking they are super smart and changing this to
* btrfs_join_transaction *cough*Josef*cough*.
*/
@@ -2162,25 +2186,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
- /*
- * If any of the ordered extents had an error, just return it to user
- * space, so that the application knows some writes didn't succeed and
- * can take proper action (retry for e.g.). Blindly committing the
- * transaction in this case, would fool userspace that everything was
- * successful. And we also want to make sure our log doesn't contain
- * file extent items pointing to extents that weren't fully written to -
- * just like in the non fast fsync path, where we check for the ordered
- * operation's error flag before writing to the log tree and return -EIO
- * if any of them had this flag set (btrfs_wait_ordered_range) -
- * therefore we need to check for errors in the ordered operations,
- * which are indicated by ctx.io_err.
- */
- if (ctx.io_err) {
- btrfs_end_transaction(trans);
- ret = ctx.io_err;
- goto out;
- }
-
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
ret = btrfs_sync_log(trans, root, &ctx);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index d673659..e508908 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -74,11 +74,11 @@ out:
return ret;
}
-struct btrfs_free_space_info *
-search_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_path *path, int cow)
+EXPORT_FOR_TESTS
+struct btrfs_free_space_info *search_free_space_info(
+ struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, int cow)
{
struct btrfs_root *root = fs_info->free_space_root;
struct btrfs_key key;
@@ -176,6 +176,7 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
}
}
+EXPORT_FOR_TESTS
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
@@ -315,6 +316,7 @@ out:
return ret;
}
+EXPORT_FOR_TESTS
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
@@ -487,6 +489,7 @@ out:
return ret;
}
+EXPORT_FOR_TESTS
int free_space_test_bit(struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 offset)
{
@@ -775,6 +778,7 @@ out:
return ret;
}
+EXPORT_FOR_TESTS
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size)
@@ -968,6 +972,7 @@ out:
return ret;
}
+EXPORT_FOR_TESTS
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9ea4c6f..43eb453 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -27,6 +27,7 @@
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
+#include <linux/swap.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
@@ -103,23 +104,23 @@ static void __endio_write_update_ordered(struct inode *inode,
/*
* Cleanup all submitted ordered extents in specified range to handle errors
- * from the fill_dellaloc() callback.
+ * from the btrfs_run_delalloc_range() callback.
*
* NOTE: caller must ensure that when an error happens, it can not call
* extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
* and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
* to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of the
- * fill_delalloc() callback already does proper cleanup for the first page of
- * the range, that is, it invokes the callback writepage_end_io_hook() for the
- * range of the first page.
+ * extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
- const u64 offset,
- const u64 bytes)
+ struct page *locked_page,
+ u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+ u64 page_start = page_offset(locked_page);
+ u64 page_end = page_start + PAGE_SIZE - 1;
+
struct page *page;
while (index <= end_index) {
@@ -130,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
ClearPagePrivate2(page);
put_page(page);
}
- return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
- bytes - PAGE_SIZE, false);
+
+ /*
+ * In case this page belongs to the delalloc range being instantiated
+ * then skip it, since the first page of a range is going to be
+ * properly cleaned up by the caller of run_delalloc_range
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ offset += PAGE_SIZE;
+ bytes -= PAGE_SIZE;
+ }
+
+ return __endio_write_update_ordered(inode, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
@@ -229,7 +240,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
start >> PAGE_SHIFT);
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_atomic(page);
- offset = start & (PAGE_SIZE - 1);
+ offset = offset_in_page(start);
write_extent_buffer(leaf, kaddr + offset, ptr, size);
kunmap_atomic(kaddr);
put_page(page);
@@ -357,7 +368,7 @@ struct async_extent {
struct async_cow {
struct inode *inode;
- struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info;
struct page *locked_page;
u64 start;
u64 end;
@@ -538,8 +549,7 @@ again:
&total_compressed);
if (!ret) {
- unsigned long offset = total_compressed &
- (PAGE_SIZE - 1);
+ unsigned long offset = offset_in_page(total_compressed);
struct page *page = pages[nr_pages - 1];
char *kaddr;
@@ -847,14 +857,13 @@ retry:
ins.offset, async_extent->pages,
async_extent->nr_pages,
async_cow->write_flags)) {
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *p = async_extent->pages[0];
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
p->mapping = inode->i_mapping;
- tree->ops->writepage_end_io_hook(p, start, end,
- NULL, 0);
+ btrfs_writepage_endio_finish_ordered(p, start, end, 0);
+
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, end,
NULL, 0,
@@ -1144,13 +1153,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
{
struct btrfs_fs_info *fs_info;
struct async_cow *async_cow;
- struct btrfs_root *root;
unsigned long nr_pages;
async_cow = container_of(work, struct async_cow, work);
- root = async_cow->root;
- fs_info = root->fs_info;
+ fs_info = async_cow->fs_info;
nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
PAGE_SHIFT;
@@ -1179,7 +1186,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_cow *async_cow;
- struct btrfs_root *root = BTRFS_I(inode)->root;
unsigned long nr_pages;
u64 cur_end;
@@ -1189,7 +1195,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
BUG_ON(!async_cow); /* -ENOMEM */
async_cow->inode = igrab(inode);
- async_cow->root = root;
+ async_cow->fs_info = fs_info;
async_cow->locked_page = locked_page;
async_cow->start = start;
async_cow->write_flags = write_flags;
@@ -1372,7 +1378,8 @@ next_slot:
* Do the same check as in btrfs_cross_ref_exist but
* without the unnecessary search.
*/
- if (btrfs_file_extent_generation(leaf, fi) <=
+ if (!nolock &&
+ btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item))
goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
@@ -1576,12 +1583,12 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
}
/*
- * extent_io.c call back to do delayed allocation processing
+ * Function to process delayed allocation (create CoW) for ranges which are
+ * being touched for the first time.
*/
-static int run_delalloc_range(void *private_data, struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- struct writeback_control *wbc)
+int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+ u64 start, u64 end, int *page_started, unsigned long *nr_written,
+ struct writeback_control *wbc)
{
struct inode *inode = private_data;
int ret;
@@ -1605,14 +1612,14 @@ static int run_delalloc_range(void *private_data, struct page *locked_page,
write_flags);
}
if (ret)
- btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
+ btrfs_cleanup_ordered_extents(inode, locked_page, start,
+ end - start + 1);
return ret;
}
-static void btrfs_split_extent_hook(void *private_data,
- struct extent_state *orig, u64 split)
+void btrfs_split_delalloc_extent(struct inode *inode,
+ struct extent_state *orig, u64 split)
{
- struct inode *inode = private_data;
u64 size;
/* not delalloc, ignore it */
@@ -1625,7 +1632,7 @@ static void btrfs_split_extent_hook(void *private_data,
u64 new_size;
/*
- * See the explanation in btrfs_merge_extent_hook, the same
+ * See the explanation in btrfs_merge_delalloc_extent, the same
* applies here, just in reverse.
*/
new_size = orig->end - split + 1;
@@ -1642,16 +1649,13 @@ static void btrfs_split_extent_hook(void *private_data,
}
/*
- * extent_io.c merge_extent_hook, used to track merged delayed allocation
- * extents so we can keep track of new extents that are just merged onto old
- * extents, such as when we are doing sequential writes, so we can properly
- * account for the metadata space we'll need.
+ * Handle merged delayed allocation extents so we can keep track of new extents
+ * that are just merged onto old extents, such as when we are doing sequential
+ * writes, so we can properly account for the metadata space we'll need.
*/
-static void btrfs_merge_extent_hook(void *private_data,
- struct extent_state *new,
- struct extent_state *other)
+void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+ struct extent_state *other)
{
- struct inode *inode = private_data;
u64 new_size, old_size;
u32 num_extents;
@@ -1755,15 +1759,12 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
}
/*
- * extent_io.c set_bit_hook, used to track delayed allocation
- * bytes in this file, and to maintain the list of inodes that
- * have pending delalloc work to be done.
+ * Properly track delayed allocation bytes in the inode and to maintain the
+ * list of inodes that have pending delalloc work to be done.
*/
-static void btrfs_set_bit_hook(void *private_data,
- struct extent_state *state, unsigned *bits)
+void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+ unsigned *bits)
{
- struct inode *inode = private_data;
-
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -1809,14 +1810,14 @@ static void btrfs_set_bit_hook(void *private_data,
}
/*
- * extent_io.c clear_bit_hook, see set_bit_hook for why
+ * Once a range is no longer delalloc this function ensures that proper
+ * accounting happens.
*/
-static void btrfs_clear_bit_hook(void *private_data,
- struct extent_state *state,
- unsigned *bits)
+void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
+ struct extent_state *state, unsigned *bits)
{
- struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(len);
@@ -1841,7 +1842,7 @@ static void btrfs_clear_bit_hook(void *private_data,
/*
* We don't reserve metadata space for space cache inodes so we
- * don't need to call dellalloc_release_metadata if there is an
+ * don't need to call delalloc_release_metadata if there is an
* error.
*/
if (*bits & EXTENT_CLEAR_META_RESV &&
@@ -1880,16 +1881,21 @@ static void btrfs_clear_bit_hook(void *private_data,
}
/*
- * Merge bio hook, this must check the chunk tree to make sure we don't create
- * bios that span stripes or chunks
+ * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
+ * in a chunk's stripe. This function ensures that bios do not span a
+ * stripe/chunk
*
- * return 1 if page cannot be merged to bio
- * return 0 if page can be merged to bio
+ * @page - The page we are about to add to the bio
+ * @size - size we want to add to the bio
+ * @bio - bio we want to ensure is smaller than a stripe
+ * @bio_flags - flags of the bio
+ *
+ * return 1 if page cannot be added to the bio
+ * return 0 if page can be added to the bio
* return error otherwise
*/
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
- size_t size, struct bio *bio,
- unsigned long bio_flags)
+int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
+ unsigned long bio_flags)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1932,29 +1938,6 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
}
/*
- * in order to insert checksums into the metadata in large chunks,
- * we wait until bio submission time. All the pages in the bio are
- * checksummed and sums are attached onto the ordered extent record.
- *
- * At IO completion time the cums attached on the ordered extent record
- * are inserted into the btree
- */
-blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
- int mirror_num)
-{
- struct inode *inode = private_data;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- blk_status_t ret;
-
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
- if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
- }
- return ret;
-}
-
-/*
* extent_io.c submission hook. This does the right thing for csum calculation
* on write, or reading the csums from the tree before a read.
*
@@ -2056,7 +2039,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state, int dedupe)
{
- WARN_ON((end & (PAGE_SIZE - 1)) == 0);
+ WARN_ON(PAGE_ALIGNED(end));
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
extra_bits, cached_state);
}
@@ -2152,7 +2135,7 @@ out_page:
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
*/
-static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3159,8 +3142,8 @@ static void finish_ordered_fn(struct btrfs_work *work)
btrfs_finish_ordered_io(ordered_extent);
}
-static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
- struct extent_state *state, int uptodate)
+void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+ u64 end, int uptodate)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3686,6 +3669,21 @@ cache_index:
* inode is not a directory, logging its parent unnecessarily.
*/
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ /*
+ * Similar reasoning for last_link_trans, needs to be set otherwise
+ * for a case like the following:
+ *
+ * mkdir A
+ * touch foo
+ * ln foo A/bar
+ * echo 2 > /proc/sys/vm/drop_caches
+ * fsync foo
+ * <power failure>
+ *
+ * Would result in link bar and directory A not existing after the power
+ * failure.
+ */
+ BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
path->slots[0]++;
if (inode->i_nlink != 1 ||
@@ -4444,31 +4442,6 @@ out:
return err;
}
-static int truncate_space_check(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytes_deleted)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
-
- /*
- * This is only used to apply pressure to the enospc system, we don't
- * intend to use this reservation at all.
- */
- bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
- bytes_deleted *= fs_info->nodesize;
- ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
- bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
- if (!ret) {
- trace_btrfs_space_reservation(fs_info, "transaction",
- trans->transid,
- bytes_deleted, 1);
- trans->bytes_reserved += bytes_deleted;
- }
- return ret;
-
-}
-
/*
* Return this if we need to call truncate_block for the last bit of the
* truncate.
@@ -4513,7 +4486,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 bytes_deleted = 0;
bool be_nice = false;
bool should_throttle = false;
- bool should_end = false;
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
@@ -4544,7 +4516,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
/*
* This function is also used to drop the items in the log tree before
* we relog the inode, so if root != BTRFS_I(inode)->root, it means
- * it is used to drop the loged items. So we shouldn't kill the delayed
+ * it is used to drop the logged items. So we shouldn't kill the delayed
* items.
*/
if (min_type == 0 && root == BTRFS_I(inode)->root)
@@ -4726,15 +4698,7 @@ delete:
btrfs_abort_transaction(trans, ret);
break;
}
- if (btrfs_should_throttle_delayed_refs(trans))
- btrfs_async_run_delayed_refs(fs_info,
- trans->delayed_ref_updates * 2,
- trans->transid, 0);
if (be_nice) {
- if (truncate_space_check(trans, root,
- extent_num_bytes)) {
- should_end = true;
- }
if (btrfs_should_throttle_delayed_refs(trans))
should_throttle = true;
}
@@ -4745,7 +4709,7 @@ delete:
if (path->slots[0] == 0 ||
path->slots[0] != pending_del_slot ||
- should_throttle || should_end) {
+ should_throttle) {
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
@@ -4757,23 +4721,24 @@ delete:
pending_del_nr = 0;
}
btrfs_release_path(path);
- if (should_throttle) {
- unsigned long updates = trans->delayed_ref_updates;
- if (updates) {
- trans->delayed_ref_updates = 0;
- ret = btrfs_run_delayed_refs(trans,
- updates * 2);
- if (ret)
- break;
- }
- }
+
/*
- * if we failed to refill our space rsv, bail out
- * and let the transaction restart
+ * We can generate a lot of delayed refs, so we need to
+ * throttle every once and a while and make sure we're
+ * adding enough space to keep up with the work we are
+ * generating. Since we hold a transaction here we
+ * can't flush, and we don't want to FLUSH_LIMIT because
+ * we could have generated too many delayed refs to
+ * actually allocate, so just bail if we're short and
+ * let the normal reservation dance happen higher up.
*/
- if (should_end) {
- ret = -EAGAIN;
- break;
+ if (should_throttle) {
+ ret = btrfs_delayed_refs_rsv_refill(fs_info,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (ret) {
+ ret = -EAGAIN;
+ break;
+ }
}
goto search_again;
} else {
@@ -4799,18 +4764,6 @@ out:
}
btrfs_free_path(path);
-
- if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) {
- unsigned long updates = trans->delayed_ref_updates;
- int err;
-
- if (updates) {
- trans->delayed_ref_updates = 0;
- err = btrfs_run_delayed_refs(trans, updates * 2);
- if (err)
- ret = err;
- }
- }
return ret;
}
@@ -5155,7 +5108,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
truncate_setsize(inode, newsize);
- /* Disable nonlocked read DIO to avoid the end less truncate */
+ /* Disable nonlocked read DIO to avoid the endless truncate */
btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
@@ -5333,8 +5286,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
* Try to steal from the global reserve if there is space for
* it.
*/
- if (!btrfs_check_space_for_delayed_refs(trans) &&
- !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false))
+ if (!btrfs_check_space_for_delayed_refs(fs_info) &&
+ !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0))
return trans;
/* If not, commit and try again. */
@@ -6406,14 +6359,19 @@ fail_dir_item:
err = btrfs_del_root_ref(trans, key.objectid,
root->root_key.objectid, parent_ino,
&local_index, name, name_len);
-
+ if (err)
+ btrfs_abort_transaction(trans, err);
} else if (add_backref) {
u64 local_index;
int err;
err = btrfs_del_inode_ref(trans, root, name, name_len,
ino, parent_ino, &local_index);
+ if (err)
+ btrfs_abort_transaction(trans, err);
}
+
+ /* Return the original error code */
return ret;
}
@@ -6625,6 +6583,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
goto fail;
}
+ BTRFS_I(inode)->last_link_trans = trans->transid;
d_instantiate(dentry, inode);
ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
true, NULL);
@@ -6652,7 +6611,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
int err = 0;
- int drop_on_err = 0;
u64 objectid = 0;
u64 index = 0;
@@ -6678,7 +6636,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out_fail;
}
- drop_on_err = 1;
/* these must be set before we unlock the inode */
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
@@ -6699,7 +6656,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out_fail;
d_instantiate_new(dentry, inode);
- drop_on_err = 0;
out_fail:
btrfs_end_transaction(trans);
@@ -8053,9 +8009,7 @@ static void btrfs_endio_direct_read(struct bio *bio)
dio_bio->bi_status = err;
dio_end_io(dio_bio);
-
- if (io_bio->end_io)
- io_bio->end_io(io_bio, blk_status_to_errno(err));
+ btrfs_io_bio_free_csum(io_bio);
bio_put(bio);
}
@@ -8098,7 +8052,7 @@ static void __endio_write_update_ordered(struct inode *inode,
return;
/*
* Our bio might span multiple ordered extents. In this case
- * we keep goin until we have accounted the whole dio.
+ * we keep going until we have accounted the whole dio.
*/
if (ordered_offset < offset + bytes) {
ordered_bytes = offset + bytes - ordered_offset;
@@ -8408,8 +8362,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
if (!ret)
return;
- if (io_bio->end_io)
- io_bio->end_io(io_bio, ret);
+ btrfs_io_bio_free_csum(io_bio);
free_ordered:
/*
@@ -8912,7 +8865,7 @@ again:
/* page is wholly or partially inside EOF */
if (page_start + PAGE_SIZE > size)
- zero_start = size & ~PAGE_MASK;
+ zero_start = offset_in_page(size);
else
zero_start = PAGE_SIZE;
@@ -9157,6 +9110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
+ ei->last_link_trans = 0;
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
@@ -9968,7 +9922,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root, int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -9996,6 +9950,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr)
}
spin_unlock(&root->delalloc_lock);
+ if (snapshot)
+ set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+ &binode->runtime_flags);
work = btrfs_alloc_delalloc_work(inode);
if (!work) {
iput(inode);
@@ -10029,7 +9986,7 @@ out:
return ret;
}
-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -10037,7 +9994,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- ret = start_delalloc_inodes(root, -1);
+ ret = start_delalloc_inodes(root, -1, true);
if (ret > 0)
ret = 0;
return ret;
@@ -10066,7 +10023,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, nr);
+ ret = start_delalloc_inodes(root, nr, false);
btrfs_put_fs_root(root);
if (ret < 0)
goto out;
@@ -10445,26 +10402,6 @@ out:
return ret;
}
-__attribute__((const))
-static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
-{
- return -EAGAIN;
-}
-
-static void btrfs_check_extent_io_range(void *private_data, const char *caller,
- u64 start, u64 end)
-{
- struct inode *inode = private_data;
- u64 isize;
-
- isize = i_size_read(inode);
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
- }
-}
-
void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
struct inode *inode = tree->private_data;
@@ -10481,6 +10418,343 @@ void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
}
}
+#ifdef CONFIG_SWAP
+/*
+ * Add an entry indicating a block group or device which is pinned by a
+ * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
+ * negative errno on failure.
+ */
+static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
+ bool is_block_group)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_swapfile_pin *sp, *entry;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+
+ sp = kmalloc(sizeof(*sp), GFP_NOFS);
+ if (!sp)
+ return -ENOMEM;
+ sp->ptr = ptr;
+ sp->inode = inode;
+ sp->is_block_group = is_block_group;
+
+ spin_lock(&fs_info->swapfile_pins_lock);
+ p = &fs_info->swapfile_pins.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
+ if (sp->ptr < entry->ptr ||
+ (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
+ p = &(*p)->rb_left;
+ } else if (sp->ptr > entry->ptr ||
+ (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
+ p = &(*p)->rb_right;
+ } else {
+ spin_unlock(&fs_info->swapfile_pins_lock);
+ kfree(sp);
+ return 1;
+ }
+ }
+ rb_link_node(&sp->node, parent, p);
+ rb_insert_color(&sp->node, &fs_info->swapfile_pins);
+ spin_unlock(&fs_info->swapfile_pins_lock);
+ return 0;
+}
+
+/* Free all of the entries pinned by this swapfile. */
+static void btrfs_free_swapfile_pins(struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_swapfile_pin *sp;
+ struct rb_node *node, *next;
+
+ spin_lock(&fs_info->swapfile_pins_lock);
+ node = rb_first(&fs_info->swapfile_pins);
+ while (node) {
+ next = rb_next(node);
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
+ if (sp->inode == inode) {
+ rb_erase(&sp->node, &fs_info->swapfile_pins);
+ if (sp->is_block_group)
+ btrfs_put_block_group(sp->ptr);
+ kfree(sp);
+ }
+ node = next;
+ }
+ spin_unlock(&fs_info->swapfile_pins_lock);
+}
+
+struct btrfs_swap_info {
+ u64 start;
+ u64 block_start;
+ u64 block_len;
+ u64 lowest_ppage;
+ u64 highest_ppage;
+ unsigned long nr_pages;
+ int nr_extents;
+};
+
+static int btrfs_add_swap_extent(struct swap_info_struct *sis,
+ struct btrfs_swap_info *bsi)
+{
+ unsigned long nr_pages;
+ u64 first_ppage, first_ppage_reported, next_ppage;
+ int ret;
+
+ first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
+ next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ if (first_ppage >= next_ppage)
+ return 0;
+ nr_pages = next_ppage - first_ppage;
+
+ first_ppage_reported = first_ppage;
+ if (bsi->start == 0)
+ first_ppage_reported++;
+ if (bsi->lowest_ppage > first_ppage_reported)
+ bsi->lowest_ppage = first_ppage_reported;
+ if (bsi->highest_ppage < (next_ppage - 1))
+ bsi->highest_ppage = next_ppage - 1;
+
+ ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
+ if (ret < 0)
+ return ret;
+ bsi->nr_extents += ret;
+ bsi->nr_pages += nr_pages;
+ return 0;
+}
+
+static void btrfs_swap_deactivate(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+
+ btrfs_free_swapfile_pins(inode);
+ atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
+}
+
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+ sector_t *span)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_state *cached_state = NULL;
+ struct extent_map *em = NULL;
+ struct btrfs_device *device = NULL;
+ struct btrfs_swap_info bsi = {
+ .lowest_ppage = (sector_t)-1ULL,
+ };
+ int ret = 0;
+ u64 isize;
+ u64 start;
+
+ /*
+ * If the swap file was just created, make sure delalloc is done. If the
+ * file changes again after this, the user is doing something stupid and
+ * we don't really care.
+ */
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (ret)
+ return ret;
+
+ /*
+ * The inode is locked, so these flags won't change after we check them.
+ */
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
+ btrfs_warn(fs_info, "swapfile must not be compressed");
+ return -EINVAL;
+ }
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
+ btrfs_warn(fs_info, "swapfile must not be copy-on-write");
+ return -EINVAL;
+ }
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ btrfs_warn(fs_info, "swapfile must not be checksummed");
+ return -EINVAL;
+ }
+
+ /*
+ * Balance or device remove/replace/resize can move stuff around from
+ * under us. The EXCL_OP flag makes sure they aren't running/won't run
+ * concurrently while we are mapping the swap extents, and
+ * fs_info->swapfile_pins prevents them from running while the swap file
+ * is active and moving the extents. Note that this also prevents a
+ * concurrent device add which isn't actually necessary, but it's not
+ * really worth the trouble to allow it.
+ */
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ btrfs_warn(fs_info,
+ "cannot activate swapfile while exclusive operation is running");
+ return -EBUSY;
+ }
+ /*
+ * Snapshots can create extents which require COW even if NODATACOW is
+ * set. We use this counter to prevent snapshots. We must increment it
+ * before walking the extents because we don't want a concurrent
+ * snapshot to run after we've already checked the extents.
+ */
+ atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles);
+
+ isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
+
+ lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
+ start = 0;
+ while (start < isize) {
+ u64 logical_block_start, physical_block_start;
+ struct btrfs_block_group_cache *bg;
+ u64 len = isize - start;
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ btrfs_warn(fs_info, "swapfile must not have holes");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (em->block_start == EXTENT_MAP_INLINE) {
+ /*
+ * It's unlikely we'll ever actually find ourselves
+ * here, as a file small enough to fit inline won't be
+ * big enough to store more than the swap header, but in
+ * case something changes in the future, let's catch it
+ * here rather than later.
+ */
+ btrfs_warn(fs_info, "swapfile must not be inline");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ btrfs_warn(fs_info, "swapfile must not be compressed");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ logical_block_start = em->block_start + (start - em->start);
+ len = min(len, em->len - (start - em->start));
+ free_extent_map(em);
+ em = NULL;
+
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ ret = 0;
+ } else {
+ btrfs_warn(fs_info,
+ "swapfile must not be copy-on-write");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ btrfs_warn(fs_info,
+ "swapfile must have single data profile");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (device == NULL) {
+ device = em->map_lookup->stripes[0].dev;
+ ret = btrfs_add_swapfile_pin(inode, device, false);
+ if (ret == 1)
+ ret = 0;
+ else if (ret)
+ goto out;
+ } else if (device != em->map_lookup->stripes[0].dev) {
+ btrfs_warn(fs_info, "swapfile must be on one device");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ physical_block_start = (em->map_lookup->stripes[0].physical +
+ (logical_block_start - em->start));
+ len = min(len, em->len - (logical_block_start - em->start));
+ free_extent_map(em);
+ em = NULL;
+
+ bg = btrfs_lookup_block_group(fs_info, logical_block_start);
+ if (!bg) {
+ btrfs_warn(fs_info,
+ "could not find block group containing swapfile");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_add_swapfile_pin(inode, bg, true);
+ if (ret) {
+ btrfs_put_block_group(bg);
+ if (ret == 1)
+ ret = 0;
+ else
+ goto out;
+ }
+
+ if (bsi.block_len &&
+ bsi.block_start + bsi.block_len == physical_block_start) {
+ bsi.block_len += len;
+ } else {
+ if (bsi.block_len) {
+ ret = btrfs_add_swap_extent(sis, &bsi);
+ if (ret)
+ goto out;
+ }
+ bsi.start = start;
+ bsi.block_start = physical_block_start;
+ bsi.block_len = len;
+ }
+
+ start += len;
+ }
+
+ if (bsi.block_len)
+ ret = btrfs_add_swap_extent(sis, &bsi);
+
+out:
+ if (!IS_ERR_OR_NULL(em))
+ free_extent_map(em);
+
+ unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
+
+ if (ret)
+ btrfs_swap_deactivate(file);
+
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+
+ if (ret)
+ return ret;
+
+ if (device)
+ sis->bdev = device->bdev;
+ *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
+ sis->max = bsi.nr_pages;
+ sis->pages = bsi.nr_pages - 1;
+ sis->highest_bit = bsi.nr_pages - 1;
+ return bsi.nr_extents;
+}
+#else
+static void btrfs_swap_deactivate(struct file *file)
+{
+}
+
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+ sector_t *span)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -10523,17 +10797,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
/* mandatory callbacks */
.submit_bio_hook = btrfs_submit_bio_hook,
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
- .readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
-
- /* optional callbacks */
- .fill_delalloc = run_delalloc_range,
- .writepage_end_io_hook = btrfs_writepage_end_io_hook,
- .writepage_start_hook = btrfs_writepage_start_hook,
- .set_bit_hook = btrfs_set_bit_hook,
- .clear_bit_hook = btrfs_clear_bit_hook,
- .merge_extent_hook = btrfs_merge_extent_hook,
- .split_extent_hook = btrfs_split_extent_hook,
- .check_extent_io_range = btrfs_check_extent_io_range,
};
/*
@@ -10558,6 +10821,8 @@ static const struct address_space_operations btrfs_aops = {
.releasepage = btrfs_releasepage,
.set_page_dirty = btrfs_set_page_dirty,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = btrfs_swap_activate,
+ .swap_deactivate = btrfs_swap_deactivate,
};
static const struct inode_operations btrfs_file_inode_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 802a628..fab9443 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -290,6 +290,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
} else if (fsflags & FS_COMPR_FL) {
const char *comp;
+ if (IS_SWAPFILE(inode)) {
+ ret = -ETXTBSY;
+ goto out_unlock;
+ }
+
binode->flags |= BTRFS_INODE_COMPRESS;
binode->flags &= ~BTRFS_INODE_NOCOMPRESS;
@@ -754,6 +759,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
+ if (atomic_read(&root->nr_swapfiles)) {
+ btrfs_warn(fs_info,
+ "cannot snapshot subvolume with active swapfile");
+ return -ETXTBSY;
+ }
+
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
if (!pending_snapshot)
return -ENOMEM;
@@ -777,7 +788,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
wait_event(root->subv_writers->wait,
percpu_counter_sum(&root->subv_writers->counter) == 0);
- ret = btrfs_start_delalloc_inodes(root);
+ ret = btrfs_start_delalloc_snapshot(root);
if (ret)
goto dec_and_free;
@@ -1505,9 +1516,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
inode_lock(inode);
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ if (IS_SWAPFILE(inode)) {
+ ret = -ETXTBSY;
+ } else {
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ }
if (ret < 0) {
inode_unlock(inode);
goto out_ra;
@@ -3135,7 +3150,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
}
rcu_read_unlock();
- memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
+ memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid));
fi_args->nodesize = fs_info->nodesize;
fi_args->sectorsize = fs_info->sectorsize;
fi_args->clone_alignment = fs_info->sectorsize;
@@ -3191,92 +3206,6 @@ out:
return ret;
}
-static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
-{
- struct page *page;
-
- page = grab_cache_page(inode->i_mapping, index);
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- if (!PageUptodate(page)) {
- int ret;
-
- ret = btrfs_readpage(NULL, page);
- if (ret)
- return ERR_PTR(ret);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-EIO);
- }
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-EAGAIN);
- }
- }
-
- return page;
-}
-
-static int gather_extent_pages(struct inode *inode, struct page **pages,
- int num_pages, u64 off)
-{
- int i;
- pgoff_t index = off >> PAGE_SHIFT;
-
- for (i = 0; i < num_pages; i++) {
-again:
- pages[i] = extent_same_get_page(inode, index + i);
- if (IS_ERR(pages[i])) {
- int err = PTR_ERR(pages[i]);
-
- if (err == -EAGAIN)
- goto again;
- pages[i] = NULL;
- return err;
- }
- }
- return 0;
-}
-
-static int lock_extent_range(struct inode *inode, u64 off, u64 len,
- bool retry_range_locking)
-{
- /*
- * Do any pending delalloc/csum calculations on inode, one way or
- * another, and lock file content.
- * The locking order is:
- *
- * 1) pages
- * 2) range in the inode's io tree
- */
- while (1) {
- struct btrfs_ordered_extent *ordered;
- lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
- ordered = btrfs_lookup_first_ordered_extent(inode,
- off + len - 1);
- if ((!ordered ||
- ordered->file_offset + ordered->len <= off ||
- ordered->file_offset >= off + len) &&
- !test_range_bit(&BTRFS_I(inode)->io_tree, off,
- off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- break;
- }
- unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- if (!retry_range_locking)
- return -EAGAIN;
- btrfs_wait_ordered_range(inode, off, len);
- }
- return 0;
-}
-
static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
{
inode_unlock(inode1);
@@ -3292,261 +3221,32 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
inode_lock_nested(inode2, I_MUTEX_CHILD);
}
-static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-}
-
-static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len,
- bool retry_range_locking)
-{
- int ret;
-
- if (inode1 < inode2) {
- swap(inode1, inode2);
- swap(loff1, loff2);
- }
- ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
- if (ret)
- return ret;
- ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
- if (ret)
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
- loff1 + len - 1);
- return ret;
-}
-
-struct cmp_pages {
- int num_pages;
- struct page **src_pages;
- struct page **dst_pages;
-};
-
-static void btrfs_cmp_data_free(struct cmp_pages *cmp)
-{
- int i;
- struct page *pg;
-
- for (i = 0; i < cmp->num_pages; i++) {
- pg = cmp->src_pages[i];
- if (pg) {
- unlock_page(pg);
- put_page(pg);
- cmp->src_pages[i] = NULL;
- }
- pg = cmp->dst_pages[i];
- if (pg) {
- unlock_page(pg);
- put_page(pg);
- cmp->dst_pages[i] = NULL;
- }
- }
-}
-
-static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
- struct inode *dst, u64 dst_loff,
- u64 len, struct cmp_pages *cmp)
-{
- int ret;
- int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
-
- cmp->num_pages = num_pages;
-
- ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff);
- if (ret)
- goto out;
-
- ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff);
-
-out:
- if (ret)
- btrfs_cmp_data_free(cmp);
- return ret;
-}
-
-static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp)
-{
- int ret = 0;
- int i;
- struct page *src_page, *dst_page;
- unsigned int cmp_len = PAGE_SIZE;
- void *addr, *dst_addr;
-
- i = 0;
- while (len) {
- if (len < PAGE_SIZE)
- cmp_len = len;
-
- BUG_ON(i >= cmp->num_pages);
-
- src_page = cmp->src_pages[i];
- dst_page = cmp->dst_pages[i];
- ASSERT(PageLocked(src_page));
- ASSERT(PageLocked(dst_page));
-
- addr = kmap_atomic(src_page);
- dst_addr = kmap_atomic(dst_page);
-
- flush_dcache_page(src_page);
- flush_dcache_page(dst_page);
-
- if (memcmp(addr, dst_addr, cmp_len))
- ret = -EBADE;
-
- kunmap_atomic(addr);
- kunmap_atomic(dst_addr);
-
- if (ret)
- break;
-
- len -= cmp_len;
- i++;
- }
-
- return ret;
-}
-
-static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
- u64 olen)
-{
- u64 len = *plen;
- u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
-
- if (off + olen > inode->i_size || off + olen < off)
- return -EINVAL;
-
- /* if we extend to eof, continue to block boundary */
- if (off + len == inode->i_size)
- *plen = len = ALIGN(inode->i_size, bs) - off;
-
- /* Check that we are block aligned - btrfs_clone() requires this */
- if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
- return -EINVAL;
-
- return 0;
-}
-
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
- struct inode *dst, u64 dst_loff,
- struct cmp_pages *cmp)
+ struct inode *dst, u64 dst_loff)
{
+ u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
int ret;
u64 len = olen;
- bool same_inode = (src == dst);
- u64 same_lock_start = 0;
- u64 same_lock_len = 0;
-
- ret = extent_same_check_offsets(src, loff, &len, olen);
- if (ret)
- return ret;
-
- ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
- if (ret)
- return ret;
-
- if (same_inode) {
- /*
- * Single inode case wants the same checks, except we
- * don't want our length pushed out past i_size as
- * comparing that data range makes no sense.
- *
- * extent_same_check_offsets() will do this for an
- * unaligned length at i_size, so catch it here and
- * reject the request.
- *
- * This effectively means we require aligned extents
- * for the single-inode case, whereas the other cases
- * allow an unaligned length so long as it ends at
- * i_size.
- */
- if (len != olen)
- return -EINVAL;
-
- /* Check for overlapping ranges */
- if (dst_loff + len > loff && dst_loff < loff + len)
- return -EINVAL;
-
- same_lock_start = min_t(u64, loff, dst_loff);
- same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
- } else {
- /*
- * If the source and destination inodes are different, the
- * source's range end offset matches the source's i_size, that
- * i_size is not a multiple of the sector size, and the
- * destination range does not go past the destination's i_size,
- * we must round down the length to the nearest sector size
- * multiple. If we don't do this adjustment we end replacing
- * with zeroes the bytes in the range that starts at the
- * deduplication range's end offset and ends at the next sector
- * size multiple.
- */
- if (loff + olen == i_size_read(src) &&
- dst_loff + len < i_size_read(dst)) {
- const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
-
- len = round_down(i_size_read(src), sz) - loff;
- if (len == 0)
- return 0;
- olen = len;
- }
- }
-
-again:
- ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp);
- if (ret)
- return ret;
- if (same_inode)
- ret = lock_extent_range(src, same_lock_start, same_lock_len,
- false);
- else
- ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
- false);
+ if (loff + len == src->i_size)
+ len = ALIGN(src->i_size, bs) - loff;
/*
- * If one of the inodes has dirty pages in the respective range or
- * ordered extents, we need to flush dellaloc and wait for all ordered
- * extents in the range. We must unlock the pages and the ranges in the
- * io trees to avoid deadlocks when flushing delalloc (requires locking
- * pages) and when waiting for ordered extents to complete (they require
- * range locking).
+ * For same inode case we don't want our length pushed out past i_size
+ * as comparing that data range makes no sense.
+ *
+ * This effectively means we require aligned extents for the single
+ * inode case, whereas the other cases allow an unaligned length so long
+ * as it ends at i_size.
*/
- if (ret == -EAGAIN) {
- /*
- * Ranges in the io trees already unlocked. Now unlock all
- * pages before waiting for all IO to complete.
- */
- btrfs_cmp_data_free(cmp);
- if (same_inode) {
- btrfs_wait_ordered_range(src, same_lock_start,
- same_lock_len);
- } else {
- btrfs_wait_ordered_range(src, loff, len);
- btrfs_wait_ordered_range(dst, dst_loff, len);
- }
- goto again;
- }
- ASSERT(ret == 0);
- if (WARN_ON(ret)) {
- /* ranges in the io trees already unlocked */
- btrfs_cmp_data_free(cmp);
- return ret;
- }
-
- /* pass original length for comparison so we stay within i_size */
- ret = btrfs_cmp_data(olen, cmp);
- if (ret == 0)
- ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
-
- if (same_inode)
- unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
- same_lock_start + same_lock_len - 1);
- else
- btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+ if (dst == src && len != olen)
+ return -EINVAL;
- btrfs_cmp_data_free(cmp);
+ /*
+ * Lock destination range to serialize with concurrent readpages().
+ */
+ lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1);
+ ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
+ unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1);
return ret;
}
@@ -3557,58 +3257,27 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
- struct cmp_pages cmp;
int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
- bool same_inode = (src == dst);
u64 i, tail_len, chunk_count;
- if (olen == 0)
- return 0;
-
- if (same_inode)
- inode_lock(src);
- else
- btrfs_double_inode_lock(src, dst);
-
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM))
+ return -EINVAL;
+
+ if (IS_SWAPFILE(src) || IS_SWAPFILE(dst))
+ return -ETXTBSY;
tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
if (chunk_count == 0)
num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
- /*
- * If deduping ranges in the same inode, locking rules make it
- * mandatory to always lock pages in ascending order to avoid deadlocks
- * with concurrent tasks (such as starting writeback/delalloc).
- */
- if (same_inode && dst_loff < loff)
- swap(loff, dst_loff);
-
- /*
- * We must gather up all the pages before we initiate our extent
- * locking. We use an array for the page pointers. Size of the array is
- * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN.
- */
- cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *),
- GFP_KERNEL | __GFP_ZERO);
- cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *),
- GFP_KERNEL | __GFP_ZERO);
- if (!cmp.src_pages || !cmp.dst_pages) {
- ret = -ENOMEM;
- goto out_free;
- }
-
for (i = 0; i < chunk_count; i++) {
ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
- dst, dst_loff, &cmp);
+ dst, dst_loff);
if (ret)
- goto out_free;
+ return ret;
loff += BTRFS_MAX_DEDUPE_LEN;
dst_loff += BTRFS_MAX_DEDUPE_LEN;
@@ -3616,17 +3285,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
if (tail_len > 0)
ret = btrfs_extent_same_range(src, loff, tail_len, dst,
- dst_loff, &cmp);
-
-out_free:
- kvfree(cmp.src_pages);
- kvfree(cmp.dst_pages);
-
-out_unlock:
- if (same_inode)
- inode_unlock(src);
- else
- btrfs_double_inode_unlock(src, dst);
+ dst_loff);
return ret;
}
@@ -4213,11 +3872,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
struct inode *inode = file_inode(file);
struct inode *src = file_inode(file_src);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
u64 len = olen;
u64 bs = fs_info->sb->s_blocksize;
- int same_inode = src == inode;
/*
* TODO:
@@ -4230,101 +3887,35 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* be either compressed or non-compressed.
*/
- if (btrfs_root_readonly(root))
- return -EROFS;
-
- if (file_src->f_path.mnt != file->f_path.mnt ||
- src->i_sb != inode->i_sb)
- return -EXDEV;
-
- if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
- return -EISDIR;
-
- if (!same_inode) {
- btrfs_double_inode_lock(src, inode);
- } else {
- inode_lock(src);
- }
-
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+ return -EINVAL;
+
+ if (IS_SWAPFILE(src) || IS_SWAPFILE(inode))
+ return -ETXTBSY;
- /* determine range to clone */
- ret = -EINVAL;
- if (off + len > src->i_size || off + len < off)
- goto out_unlock;
- if (len == 0)
- olen = len = src->i_size - off;
/*
- * If we extend to eof, continue to block boundary if and only if the
- * destination end offset matches the destination file's size, otherwise
- * we would be corrupting data by placing the eof block into the middle
- * of a file.
+ * VFS's generic_remap_file_range_prep() protects us from cloning the
+ * eof block into the middle of a file, which would result in corruption
+ * if the file size is not blocksize aligned. So we don't need to check
+ * for that case here.
*/
- if (off + len == src->i_size) {
- if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size)
- goto out_unlock;
+ if (off + len == src->i_size)
len = ALIGN(src->i_size, bs) - off;
- }
-
- if (len == 0) {
- ret = 0;
- goto out_unlock;
- }
-
- /* verify the end result is block aligned */
- if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
- !IS_ALIGNED(destoff, bs))
- goto out_unlock;
-
- /* verify if ranges are overlapped within the same file */
- if (same_inode) {
- if (destoff + len > off && destoff < off + len)
- goto out_unlock;
- }
if (destoff > inode->i_size) {
ret = btrfs_cont_expand(inode, inode->i_size, destoff);
if (ret)
- goto out_unlock;
+ return ret;
}
/*
- * Lock the target range too. Right after we replace the file extent
- * items in the fs tree (which now point to the cloned data), we might
- * have a worker replace them with extent items relative to a write
- * operation that was issued before this clone operation (i.e. confront
- * with inode.c:btrfs_finish_ordered_io).
+ * Lock destination range to serialize with concurrent readpages().
*/
- if (same_inode) {
- u64 lock_start = min_t(u64, off, destoff);
- u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
-
- ret = lock_extent_range(src, lock_start, lock_len, true);
- } else {
- ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
- true);
- }
- ASSERT(ret == 0);
- if (WARN_ON(ret)) {
- /* ranges in the io trees already unlocked */
- goto out_unlock;
- }
-
+ lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
-
- if (same_inode) {
- u64 lock_start = min_t(u64, off, destoff);
- u64 lock_end = max_t(u64, off, destoff) + len - 1;
-
- unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
- } else {
- btrfs_double_extent_unlock(src, off, inode, destoff, len);
- }
+ unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1);
/*
* Truncate page cache pages so that future reads will see the cloned
* data immediately and not the previous data.
@@ -4332,11 +3923,87 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
truncate_inode_pages_range(&inode->i_data,
round_down(destoff, PAGE_SIZE),
round_up(destoff + len, PAGE_SIZE) - 1);
-out_unlock:
+
+ return ret;
+}
+
+static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
+ bool same_inode = inode_out == inode_in;
+ u64 wb_len;
+ int ret;
+
+ if (!(remap_flags & REMAP_FILE_DEDUP)) {
+ struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+
+ if (btrfs_root_readonly(root_out))
+ return -EROFS;
+
+ if (file_in->f_path.mnt != file_out->f_path.mnt ||
+ inode_in->i_sb != inode_out->i_sb)
+ return -EXDEV;
+ }
+
+ if (same_inode)
+ inode_lock(inode_in);
+ else
+ btrfs_double_inode_lock(inode_in, inode_out);
+
+ /*
+ * Now that the inodes are locked, we need to start writeback ourselves
+ * and can not rely on the writeback from the VFS's generic helper
+ * generic_remap_file_range_prep() because:
+ *
+ * 1) For compression we must call filemap_fdatawrite_range() range
+ * twice (btrfs_fdatawrite_range() does it for us), and the generic
+ * helper only calls it once;
+ *
+ * 2) filemap_fdatawrite_range(), called by the generic helper only
+ * waits for the writeback to complete, i.e. for IO to be done, and
+ * not for the ordered extents to complete. We need to wait for them
+ * to complete so that new file extent items are in the fs tree.
+ */
+ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
+ wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+ else
+ wb_len = ALIGN(*len, bs);
+
+ /*
+ * Since we don't lock ranges, wait for ongoing lockless dio writes (as
+ * any in progress could create its ordered extents after we wait for
+ * existing ordered extents below).
+ */
+ inode_dio_wait(inode_in);
if (!same_inode)
- btrfs_double_inode_unlock(src, inode);
+ inode_dio_wait(inode_out);
+
+ ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
+ wb_len);
+ if (ret < 0)
+ goto out_unlock;
+ ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
+ wb_len);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ len, remap_flags);
+ if (ret < 0 || *len == 0)
+ goto out_unlock;
+
+ return 0;
+
+ out_unlock:
+ if (same_inode)
+ inode_unlock(inode_in);
else
- inode_unlock(src);
+ btrfs_double_inode_unlock(inode_in, inode_out);
+
return ret;
}
@@ -4344,29 +4011,29 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{
+ struct inode *src_inode = file_inode(src_file);
+ struct inode *dst_inode = file_inode(dst_file);
+ bool same_inode = dst_inode == src_inode;
int ret;
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
- if (remap_flags & REMAP_FILE_DEDUP) {
- struct inode *src = file_inode(src_file);
- struct inode *dst = file_inode(dst_file);
- u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
-
- if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
- /*
- * Btrfs does not support blocksize < page_size. As a
- * result, btrfs_cmp_data() won't correctly handle
- * this situation without an update.
- */
- return -EINVAL;
- }
+ ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ return ret;
- ret = btrfs_extent_same(src, off, len, dst, destoff);
- } else {
+ if (remap_flags & REMAP_FILE_DEDUP)
+ ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+ else
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
- }
+
+ if (same_inode)
+ inode_unlock(src_inode);
+ else
+ btrfs_double_inode_unlock(src_inode, dst_inode);
+
return ret < 0 ? ret : len;
}
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index b6a4cc1..9063914 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -27,7 +27,7 @@
* Records the total size (including the header) of compressed data.
*
* 2. Segment(s)
- * Variable size. Each segment includes one segment header, followd by data
+ * Variable size. Each segment includes one segment header, followed by data
* payload.
* One regular LZO compressed extent can have one or more segments.
* For inlined LZO compressed extent, only one segment is allowed.
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0c4ef20..6fde2b2 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
struct rb_node *node;
- bool dec_pending_ordered = false;
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
@@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
- if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
- dec_pending_ordered = true;
spin_unlock_irq(&tree->lock);
- /*
- * The current running transaction is waiting on us, we need to let it
- * know that we're complete and wake it up.
- */
- if (dec_pending_ordered) {
- struct btrfs_transaction *trans;
-
- /*
- * The checks for trans are just a formality, it should be set,
- * but if it isn't we don't want to deref/assert under the spin
- * lock, so be nice and check if trans is set, but ASSERT() so
- * if it isn't set a developer will notice.
- */
- spin_lock(&fs_info->trans_lock);
- trans = fs_info->running_transaction;
- if (trans)
- refcount_inc(&trans->use_count);
- spin_unlock(&fs_info->trans_lock);
-
- ASSERT(trans);
- if (trans) {
- if (atomic_dec_and_test(&trans->pending_ordered))
- wake_up(&trans->pending_wait);
- btrfs_put_transaction(trans);
- }
- }
-
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 02d813a..fb9a161 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -37,28 +37,31 @@ struct btrfs_ordered_sum {
* rbtree, just before waking any waiters. It is used to indicate the
* IO is done and any metadata is inserted into the tree.
*/
-#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
-
-#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
-
-#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
-
-#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
-
-#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */
-
-#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
-
-#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
-
-#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
- * has done its due diligence in updating
- * the isize. */
-#define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */
-
-#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to
- * complete in the current transaction. */
-#define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */
+enum {
+ /* set when all the pages are written */
+ BTRFS_ORDERED_IO_DONE,
+ /* set when removed from the tree */
+ BTRFS_ORDERED_COMPLETE,
+ /* set when we want to write in place */
+ BTRFS_ORDERED_NOCOW,
+ /* writing a zlib compressed extent */
+ BTRFS_ORDERED_COMPRESSED,
+ /* set when writing to preallocated extent */
+ BTRFS_ORDERED_PREALLOC,
+ /* set when we're doing DIO with this extent */
+ BTRFS_ORDERED_DIRECT,
+ /* We had an io error when writing this out */
+ BTRFS_ORDERED_IOERR,
+ /*
+ * indicates whether this ordered extent has done its due diligence in
+ * updating the isize
+ */
+ BTRFS_ORDERED_UPDATED_ISIZE,
+ /* Set when we have to truncate an extent */
+ BTRFS_ORDERED_TRUNCATED,
+ /* Regular IO for COW */
+ BTRFS_ORDERED_REGULAR,
+};
struct btrfs_ordered_extent {
/* logical offset in the file */
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 45868fd..4e473a9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -30,7 +30,7 @@
* - sync
* - copy also limits on subvol creation
* - limit
- * - caches fuer ulists
+ * - caches for ulists
* - performance benchmarks
* - check all ioctl parameters
*/
@@ -522,7 +522,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
__del_qgroup_rb(qgroup);
}
/*
- * we call btrfs_free_qgroup_config() when umounting
+ * We call btrfs_free_qgroup_config() when unmounting
* filesystem and disabling quota, so we set qgroup_ulist
* to be null here to avoid double free.
*/
@@ -1013,16 +1013,22 @@ out_add_root:
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
- spin_lock(&fs_info->qgroup_lock);
- fs_info->quota_root = quota_root;
- set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- spin_unlock(&fs_info->qgroup_lock);
ret = btrfs_commit_transaction(trans);
trans = NULL;
if (ret)
goto out_free_path;
+ /*
+ * Set quota enabled flag after committing the transaction, to avoid
+ * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
+ * creation.
+ */
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->quota_root = quota_root;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ spin_unlock(&fs_info->qgroup_lock);
+
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
@@ -1122,7 +1128,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
* The easy accounting, we're updating qgroup relationship whose child qgroup
* only has exclusive extents.
*
- * In this case, all exclsuive extents will also be exlusive for parent, so
+ * In this case, all exclusive extents will also be exclusive for parent, so
* excl/rfer just get added/removed.
*
* So is qgroup reservation space, which should also be added/removed to
@@ -1749,14 +1755,14 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
*
* 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
* NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
- * They should be marked during preivous (@dst_level = 1) iteration.
+ * They should be marked during previous (@dst_level = 1) iteration.
*
* 3) Mark file extents in leaves dirty
* We don't have good way to pick out new file extents only.
* So we still follow the old method by scanning all file extents in
* the leave.
*
- * This function can free us from keeping two pathes, thus later we only need
+ * This function can free us from keeping two paths, thus later we only need
* to care about how to iterate all new tree blocks in reloc tree.
*/
static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
@@ -1895,7 +1901,7 @@ out:
*
* We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace
* above tree blocks along with their counter parts in file tree.
- * While during search, old tree blocsk OO(c) will be skiped as tree block swap
+ * While during search, old tree blocks OO(c) will be skipped as tree block swap
* won't affect OO(c).
*/
static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
@@ -2020,7 +2026,7 @@ out:
* Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
* @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
* and then go down @src_eb (pointed by @src_parent and @src_slot) to find
- * the conterpart of the tree block, then mark both tree blocks as qgroup dirty,
+ * the counterpart of the tree block, then mark both tree blocks as qgroup dirty,
* and skip all tree blocks whose generation is smaller than last_snapshot.
*
* This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
@@ -2659,7 +2665,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
int i;
u64 *i_qgroups;
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
u32 level_size = 0;
@@ -2669,6 +2675,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
goto out;
+ quota_root = fs_info->quota_root;
if (!quota_root) {
ret = -EINVAL;
goto out;
@@ -3103,9 +3110,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
mutex_unlock(&fs_info->qgroup_rescan_lock);
goto out;
}
- extent_buffer_get(scratch_leaf);
- btrfs_tree_read_lock(scratch_leaf);
- btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
slot = path->slots[0];
btrfs_release_path(path);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3131,10 +3135,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
goto out;
}
out:
- if (scratch_leaf) {
- btrfs_tree_read_unlock_blocking(scratch_leaf);
+ if (scratch_leaf)
free_extent_buffer(scratch_leaf);
- }
if (done && !ret) {
ret = 1;
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index d8f78f5..20c6bd5 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -70,7 +70,7 @@ struct btrfs_qgroup_extent_record {
* be converted into META_PERTRANS.
*/
enum btrfs_qgroup_rsv_type {
- BTRFS_QGROUP_RSV_DATA = 0,
+ BTRFS_QGROUP_RSV_DATA,
BTRFS_QGROUP_RSV_META_PERTRANS,
BTRFS_QGROUP_RSV_META_PREALLOC,
BTRFS_QGROUP_RSV_LAST,
@@ -81,10 +81,10 @@ enum btrfs_qgroup_rsv_type {
*
* Each type should have different reservation behavior.
* E.g, data follows its io_tree flag modification, while
- * *currently* meta is just reserve-and-clear during transcation.
+ * *currently* meta is just reserve-and-clear during transaction.
*
* TODO: Add new type for reservation which can survive transaction commit.
- * Currect metadata reservation behavior is not suitable for such case.
+ * Current metadata reservation behavior is not suitable for such case.
*/
struct btrfs_qgroup_rsv {
u64 values[BTRFS_QGROUP_RSV_LAST];
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index df41d70..e74455e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1980,7 +1980,7 @@ cleanup_io:
* - In case of single failure, where rbio->failb == -1:
*
* Cache this rbio iff the above read reconstruction is
- * excuted without problems.
+ * executed without problems.
*/
if (err == BLK_STS_OK && rbio->failb < 0)
cache_rbio_pages(rbio);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index dec14b7..10d9589 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -376,26 +376,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
goto error;
}
+ /* Insert extent in reada tree + all per-device trees, all or nothing */
+ down_read(&fs_info->dev_replace.rwsem);
ret = radix_tree_preload(GFP_KERNEL);
- if (ret)
+ if (ret) {
+ up_read(&fs_info->dev_replace.rwsem);
goto error;
+ }
- /* insert extent in reada_tree + all per-device trees, all or nothing */
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&fs_info->reada_tree, index, re);
if (ret == -EEXIST) {
re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
re_exist->refcnt++;
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
radix_tree_preload_end();
+ up_read(&fs_info->dev_replace.rwsem);
goto error;
}
if (ret) {
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
radix_tree_preload_end();
+ up_read(&fs_info->dev_replace.rwsem);
goto error;
}
radix_tree_preload_end();
@@ -437,13 +439,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
}
radix_tree_delete(&fs_info->reada_tree, index);
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
goto error;
}
have_zone = 1;
}
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
if (!have_zone)
goto error;
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index d69fbfb..c3557c1 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -43,7 +43,7 @@ struct ref_entry {
* back to the delayed ref action. We hold the ref we are changing in the
* action so we can account for the history properly, and we record the root we
* were called with since it could be different from ref_root. We also store
- * stack traces because thats how I roll.
+ * stack traces because that's how I roll.
*/
struct ref_action {
int action;
@@ -56,7 +56,7 @@ struct ref_action {
/*
* One of these for every block we reference, it holds the roots and references
- * to it as well as all of the ref actions that have occured to it. We never
+ * to it as well as all of the ref actions that have occurred to it. We never
* free it until we unmount the file system in order to make sure re-allocations
* are happening properly.
*/
@@ -859,7 +859,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* This shouldn't happen because we will add our re
* above when we lookup the be with !parent, but just in
* case catch this case so we don't panic because I
- * didn't thik of some other corner case.
+ * didn't think of some other corner case.
*/
btrfs_err(fs_info, "failed to find root %llu for %llu",
root->root_key.objectid, be->bytenr);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 924116f..272b287 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2631,7 +2631,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
* only one thread can access block_rsv at this point,
* so we don't need hold lock to protect block_rsv.
* we expand more reservation size here to allow enough
- * space for relocation and we will return eailer in
+ * space for relocation and we will return earlier in
* enospc case.
*/
rc->block_rsv->size = tmp + fs_info->nodesize *
@@ -3959,6 +3959,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
restart:
if (update_backref_cache(trans, &rc->backref_cache)) {
btrfs_end_transaction(trans);
+ trans = NULL;
continue;
}
@@ -4184,37 +4185,13 @@ static struct reloc_control *alloc_reloc_control(void)
static void describe_relocation(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group)
{
- char buf[128]; /* prefixed by a '|' that'll be dropped */
- u64 flags = block_group->flags;
+ char buf[128] = {'\0'};
- /* Shouldn't happen */
- if (!flags) {
- strcpy(buf, "|NONE");
- } else {
- char *bp = buf;
-
-#define DESCRIBE_FLAG(f, d) \
- if (flags & BTRFS_BLOCK_GROUP_##f) { \
- bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
- flags &= ~BTRFS_BLOCK_GROUP_##f; \
- }
- DESCRIBE_FLAG(DATA, "data");
- DESCRIBE_FLAG(SYSTEM, "system");
- DESCRIBE_FLAG(METADATA, "metadata");
- DESCRIBE_FLAG(RAID0, "raid0");
- DESCRIBE_FLAG(RAID1, "raid1");
- DESCRIBE_FLAG(DUP, "dup");
- DESCRIBE_FLAG(RAID10, "raid10");
- DESCRIBE_FLAG(RAID5, "raid5");
- DESCRIBE_FLAG(RAID6, "raid6");
- if (flags)
- snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
-#undef DESCRIBE_FLAG
- }
+ btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
btrfs_info(fs_info,
"relocating block group %llu flags %s",
- block_group->key.objectid, buf + 1);
+ block_group->key.objectid, buf);
}
/*
@@ -4222,6 +4199,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
*/
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
+ struct btrfs_block_group_cache *bg;
struct btrfs_root *extent_root = fs_info->extent_root;
struct reloc_control *rc;
struct inode *inode;
@@ -4230,14 +4208,23 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
int rw = 0;
int err = 0;
+ bg = btrfs_lookup_block_group(fs_info, group_start);
+ if (!bg)
+ return -ENOENT;
+
+ if (btrfs_pinned_by_swapfile(fs_info, bg)) {
+ btrfs_put_block_group(bg);
+ return -ETXTBSY;
+ }
+
rc = alloc_reloc_control();
- if (!rc)
+ if (!rc) {
+ btrfs_put_block_group(bg);
return -ENOMEM;
+ }
rc->extent_root = extent_root;
-
- rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
- BUG_ON(!rc->block_group);
+ rc->block_group = bg;
ret = btrfs_inc_block_group_ro(rc->block_group);
if (ret) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 902819d..6dcd36d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -339,7 +339,9 @@ static struct full_stripe_lock *insert_full_stripe_lock(
}
}
- /* Insert new lock */
+ /*
+ * Insert new lock.
+ */
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return ERR_PTR(-ENOMEM);
@@ -568,12 +570,11 @@ static void scrub_put_ctx(struct scrub_ctx *sctx)
scrub_free_ctx(sctx);
}
-static noinline_for_stack
-struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
+static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
+ struct btrfs_fs_info *fs_info, int is_dev_replace)
{
struct scrub_ctx *sctx;
int i;
- struct btrfs_fs_info *fs_info = dev->fs_info;
sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
if (!sctx)
@@ -582,7 +583,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sctx->is_dev_replace = is_dev_replace;
sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
- sctx->fs_info = dev->fs_info;
+ sctx->fs_info = fs_info;
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio;
@@ -832,6 +833,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
int page_num;
int success;
bool full_stripe_locked;
+ unsigned int nofs_flag;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -857,6 +859,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
dev = sblock_to_check->pagev[0]->dev;
/*
+ * We must use GFP_NOFS because the scrub task might be waiting for a
+ * worker task executing this function and in turn a transaction commit
+ * might be waiting the scrub task to pause (which needs to wait for all
+ * the worker tasks to complete before pausing).
+ * We do allocations in the workers through insert_full_stripe_lock()
+ * and scrub_add_page_to_wr_bio(), which happens down the call chain of
+ * this function.
+ */
+ nofs_flag = memalloc_nofs_save();
+ /*
* For RAID5/6, race can happen for a different device scrub thread.
* For data corruption, Parity and Data threads will both try
* to recovery the data.
@@ -865,6 +877,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
*/
ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
if (ret < 0) {
+ memalloc_nofs_restore(nofs_flag);
spin_lock(&sctx->stat_lock);
if (ret == -ENOMEM)
sctx->stat.malloc_errors++;
@@ -904,7 +917,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
*/
sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
- sizeof(*sblocks_for_recheck), GFP_NOFS);
+ sizeof(*sblocks_for_recheck), GFP_KERNEL);
if (!sblocks_for_recheck) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -1202,6 +1215,7 @@ out:
}
ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
+ memalloc_nofs_restore(nofs_flag);
if (ret < 0)
return ret;
return 0;
@@ -3540,7 +3554,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!ret && sctx->is_dev_replace) {
/*
* If we are doing a device replace wait for any tasks
- * that started dellaloc right before we set the block
+ * that started delalloc right before we set the block
* group to RO mode, as they might have just allocated
* an extent from it or decided they could do a nocow
* write. And if any such tasks did that, wait for their
@@ -3596,11 +3610,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
}
- btrfs_dev_replace_write_lock(&fs_info->dev_replace);
+ down_write(&fs_info->dev_replace.rwsem);
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
+ up_write(&dev_replace->rwsem);
+
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
found_key.offset, cache);
@@ -3636,10 +3651,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
- btrfs_dev_replace_write_lock(&fs_info->dev_replace);
+ down_write(&fs_info->dev_replace.rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
+ up_write(&fs_info->dev_replace.rwsem);
if (ro_set)
btrfs_dec_block_group_ro(cache);
@@ -3772,6 +3787,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
+ unsigned int nofs_flag;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -3813,13 +3829,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return -EINVAL;
}
+ /* Allocate outside of device_list_mutex */
+ sctx = scrub_setup_ctx(fs_info, is_dev_replace);
+ if (IS_ERR(sctx))
+ return PTR_ERR(sctx);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info, devid, NULL, NULL);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- return -ENODEV;
+ ret = -ENODEV;
+ goto out_free_ctx;
}
if (!is_dev_replace && !readonly &&
@@ -3827,7 +3848,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
rcu_str_deref(dev->name));
- return -EROFS;
+ ret = -EROFS;
+ goto out_free_ctx;
}
mutex_lock(&fs_info->scrub_lock);
@@ -3835,34 +3857,29 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- return -EIO;
+ ret = -EIO;
+ goto out_free_ctx;
}
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+ down_read(&fs_info->dev_replace.rwsem);
if (dev->scrub_ctx ||
(!is_dev_replace &&
btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- return -EINPROGRESS;
+ ret = -EINPROGRESS;
+ goto out_free_ctx;
}
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
ret = scrub_workers_get(fs_info, is_dev_replace);
if (ret) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- return ret;
+ goto out_free_ctx;
}
- sctx = scrub_setup_ctx(dev, is_dev_replace);
- if (IS_ERR(sctx)) {
- mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- scrub_workers_put(fs_info);
- return PTR_ERR(sctx);
- }
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3875,6 +3892,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
atomic_inc(&fs_info->scrubs_running);
mutex_unlock(&fs_info->scrub_lock);
+ /*
+ * In order to avoid deadlock with reclaim when there is a transaction
+ * trying to pause scrub, make sure we use GFP_NOFS for all the
+ * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+ * invoked by our callees. The pausing request is done when the
+ * transaction commit starts, and it blocks the transaction until scrub
+ * is paused (done at specific points at scrub_stripe() or right above
+ * before incrementing fs_info->scrubs_running).
+ */
+ nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
/*
* by holding device list mutex, we can
@@ -3887,6 +3914,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!ret)
ret = scrub_enumerate_chunks(sctx, dev, start, end);
+ memalloc_nofs_restore(nofs_flag);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
@@ -3905,6 +3933,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
scrub_put_ctx(sctx);
return ret;
+
+out_free_ctx:
+ scrub_free_ctx(sctx);
+
+ return ret;
}
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 094cc144..7ea2d6b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2238,7 +2238,7 @@ out:
* inodes "orphan" name instead of the real name and stop. Same with new inodes
* that were not created yet and overwritten inodes/refs.
*
- * When do we have have orphan inodes:
+ * When do we have orphan inodes:
* 1. When an inode is freshly created and thus no valid refs are available yet
* 2. When a directory lost all it's refs (deleted) but still has dir items
* inside which were not processed yet (pending for move/delete). If anyone
@@ -3340,7 +3340,8 @@ static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
kfree(m);
}
-static void tail_append_pending_moves(struct pending_dir_move *moves,
+static void tail_append_pending_moves(struct send_ctx *sctx,
+ struct pending_dir_move *moves,
struct list_head *stack)
{
if (list_empty(&moves->list)) {
@@ -3351,6 +3352,10 @@ static void tail_append_pending_moves(struct pending_dir_move *moves,
list_add_tail(&moves->list, stack);
list_splice_tail(&list, stack);
}
+ if (!RB_EMPTY_NODE(&moves->node)) {
+ rb_erase(&moves->node, &sctx->pending_dir_moves);
+ RB_CLEAR_NODE(&moves->node);
+ }
}
static int apply_children_dir_moves(struct send_ctx *sctx)
@@ -3365,7 +3370,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx)
return 0;
INIT_LIST_HEAD(&stack);
- tail_append_pending_moves(pm, &stack);
+ tail_append_pending_moves(sctx, pm, &stack);
while (!list_empty(&stack)) {
pm = list_first_entry(&stack, struct pending_dir_move, list);
@@ -3376,7 +3381,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx)
goto out;
pm = get_pending_dir_moves(sctx, parent_ino);
if (pm)
- tail_append_pending_moves(pm, &stack);
+ tail_append_pending_moves(sctx, pm, &stack);
}
return 0;
@@ -3849,7 +3854,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
/*
* We may have refs where the parent directory does not exist
* yet. This happens if the parent directories inum is higher
- * the the current inum. To handle this case, we create the
+ * than the current inum. To handle this case, we create the
* parent directory out of order. But we need to check if this
* did already happen before due to other refs in the same dir.
*/
@@ -4770,7 +4775,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
struct btrfs_key key;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
- unsigned pg_offset = offset & ~PAGE_MASK;
+ unsigned pg_offset = offset_in_page(offset);
ssize_t ret = 0;
key.objectid = sctx->cur_ino;
@@ -6641,7 +6646,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
goto out;
}
- if (!access_ok(VERIFY_READ, arg->clone_sources,
+ if (!access_ok(arg->clone_sources,
sizeof(*arg->clone_sources) *
arg->clone_sources_count)) {
ret = -EFAULT;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index cbc9d0d..c5586ff 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -93,7 +93,7 @@ const char *btrfs_decode_error(int errno)
/*
* __btrfs_handle_fs_error decodes expected errors from the caller and
- * invokes the approciate error response.
+ * invokes the appropriate error response.
*/
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
@@ -151,7 +151,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* although there is no way to update the progress. It would add the
* risk of a deadlock, therefore the canceling is omitted. The only
* penalty is that some I/O remains active until the procedure
- * completes. The next time when the filesystem is mounted writeable
+ * completes. The next time when the filesystem is mounted writable
* again, the device replace operation continues.
*/
}
@@ -1458,56 +1458,6 @@ out:
return root;
}
-static int parse_security_options(char *orig_opts,
- struct security_mnt_opts *sec_opts)
-{
- char *secdata = NULL;
- int ret = 0;
-
- secdata = alloc_secdata();
- if (!secdata)
- return -ENOMEM;
- ret = security_sb_copy_data(orig_opts, secdata);
- if (ret) {
- free_secdata(secdata);
- return ret;
- }
- ret = security_sb_parse_opts_str(secdata, sec_opts);
- free_secdata(secdata);
- return ret;
-}
-
-static int setup_security_options(struct btrfs_fs_info *fs_info,
- struct super_block *sb,
- struct security_mnt_opts *sec_opts)
-{
- int ret = 0;
-
- /*
- * Call security_sb_set_mnt_opts() to check whether new sec_opts
- * is valid.
- */
- ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
- if (ret)
- return ret;
-
-#ifdef CONFIG_SECURITY
- if (!fs_info->security_opts.num_mnt_opts) {
- /* first time security setup, copy sec_opts to fs_info */
- memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
- } else {
- /*
- * Since SELinux (the only one supporting security_mnt_opts)
- * does NOT support changing context during remount/mount of
- * the same sb, this must be the same or part of the same
- * security options, just free it.
- */
- security_free_mnt_opts(sec_opts);
- }
-#endif
- return ret;
-}
-
/*
* Find a superblock for the given device / mount point.
*
@@ -1522,16 +1472,15 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
struct btrfs_device *device = NULL;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
- struct security_mnt_opts new_sec_opts;
+ void *new_sec_opts = NULL;
fmode_t mode = FMODE_READ;
int error = 0;
if (!(flags & SB_RDONLY))
mode |= FMODE_WRITE;
- security_init_mnt_opts(&new_sec_opts);
if (data) {
- error = parse_security_options(data, &new_sec_opts);
+ error = security_sb_eat_lsm_opts(data, &new_sec_opts);
if (error)
return ERR_PTR(error);
}
@@ -1550,7 +1499,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- security_init_mnt_opts(&fs_info->security_opts);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
error = -ENOMEM;
goto error_fs_info;
@@ -1601,16 +1549,12 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data);
}
+ if (!error)
+ error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
+ security_free_mnt_opts(&new_sec_opts);
if (error) {
deactivate_locked_super(s);
- goto error_sec_opts;
- }
-
- fs_info = btrfs_sb(s);
- error = setup_security_options(fs_info, s, &new_sec_opts);
- if (error) {
- deactivate_locked_super(s);
- goto error_sec_opts;
+ return ERR_PTR(error);
}
return dget(s->s_root);
@@ -1779,18 +1723,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
btrfs_remount_prepare(fs_info);
if (data) {
- struct security_mnt_opts new_sec_opts;
+ void *new_sec_opts = NULL;
- security_init_mnt_opts(&new_sec_opts);
- ret = parse_security_options(data, &new_sec_opts);
+ ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
+ if (!ret)
+ ret = security_sb_remount(sb, new_sec_opts);
+ security_free_mnt_opts(&new_sec_opts);
if (ret)
goto restore;
- ret = setup_security_options(fs_info, sb,
- &new_sec_opts);
- if (ret) {
- security_free_mnt_opts(&new_sec_opts);
- goto restore;
- }
}
ret = btrfs_parse_options(fs_info, data, *flags);
@@ -1848,7 +1788,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (!btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
- "too many missing devices, writeable remount is not allowed");
+ "too many missing devices, writable remount is not allowed");
ret = -EACCES;
goto restore;
}
@@ -2090,7 +2030,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 total_free_data = 0;
u64 total_free_meta = 0;
int bits = dentry->d_sb->s_blocksize_bits;
- __be32 *fsid = (__be32 *)fs_info->fsid;
+ __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
@@ -2237,6 +2177,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
vol = memdup_user((void __user *)arg, sizeof(*vol));
if (IS_ERR(vol))
return PTR_ERR(vol);
+ vol->name[BTRFS_PATH_NAME_MAX] = '\0';
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
@@ -2311,7 +2252,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
* device_list_mutex here as we only read the device data and the list
* is protected by RCU. Even if a device is deleted during the list
* traversals, we'll get valid data, the freeing callback will wait at
- * least until until the rcu_read_unlock.
+ * least until the rcu_read_unlock.
*/
rcu_read_lock();
cur_devices = fs_info->fs_devices;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 3717c86..5a5930e 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -191,6 +191,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
static struct attribute *btrfs_supported_feature_attrs[] = {
@@ -204,6 +205,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(raid56),
BTRFS_FEAT_ATTR_PTR(skinny_metadata),
BTRFS_FEAT_ATTR_PTR(no_holes),
+ BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
NULL
};
@@ -505,12 +507,24 @@ static ssize_t quota_override_store(struct kobject *kobj,
BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store);
+static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%pU\n",
+ fs_info->fs_devices->metadata_uuid);
+}
+
+BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
BTRFS_ATTR_PTR(, sectorsize),
BTRFS_ATTR_PTR(, clone_alignment),
BTRFS_ATTR_PTR(, quota_override),
+ BTRFS_ATTR_PTR(, metadata_uuid),
NULL,
};
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index c6ee600..40716b3 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -9,7 +9,7 @@
extern u64 btrfs_debugfs_test;
enum btrfs_feature_set {
- FEAT_COMPAT = 0,
+ FEAT_COMPAT,
FEAT_COMPAT_RO,
FEAT_INCOMPAT,
FEAT_MAX
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index db72b3b..8a59597 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -174,8 +174,10 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
return;
- if (root->node)
+ if (root->node) {
+ /* One for allocate_extent_buffer */
free_extent_buffer(root->node);
+ }
kfree(root);
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 9e0f4a0..3c46d7f 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -62,10 +62,11 @@ static int test_find_delalloc(u32 sectorsize)
struct page *page;
struct page *locked_page = NULL;
unsigned long index = 0;
- u64 total_dirty = SZ_256M;
- u64 max_bytes = SZ_128M;
+ /* In this test we need at least 2 file extents at its maximum size */
+ u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
+ u64 total_dirty = 2 * max_bytes;
u64 start, end, test_start;
- u64 found;
+ bool found;
int ret = -EINVAL;
test_msg("running find delalloc tests");
@@ -76,7 +77,7 @@ static int test_find_delalloc(u32 sectorsize)
return -ENOMEM;
}
- extent_io_tree_init(&tmp, inode);
+ extent_io_tree_init(&tmp, NULL);
/*
* First go through and create and mark all of our pages dirty, we pin
@@ -106,8 +107,8 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
end = 0;
- found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
- &end, max_bytes);
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end);
if (!found) {
test_err("should have found at least one delalloc");
goto out_bits;
@@ -137,8 +138,8 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
end = 0;
- found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
- &end, max_bytes);
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end);
if (!found) {
test_err("couldn't find delalloc in our range");
goto out_bits;
@@ -171,8 +172,8 @@ static int test_find_delalloc(u32 sectorsize)
}
start = test_start;
end = 0;
- found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
- &end, max_bytes);
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end);
if (found) {
test_err("found range when we shouldn't have");
goto out_bits;
@@ -192,8 +193,8 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
end = 0;
- found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
- &end, max_bytes);
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end);
if (!found) {
test_err("didn't find our range");
goto out_bits;
@@ -233,8 +234,8 @@ static int test_find_delalloc(u32 sectorsize)
* this changes at any point in the future we will need to fix this
* tests expected behavior.
*/
- found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
- &end, max_bytes);
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end);
if (!found) {
test_err("didn't find our range");
goto out_bits;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 64043f0..af0c8e3 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -254,11 +254,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
- /*
- * We will just free a dummy node if it's ref count is 2 so we need an
- * extra ref so our searches don't accidentally release our page.
- */
- extent_buffer_get(root->node);
btrfs_set_header_nritems(root->node, 0);
btrfs_set_header_level(root->node, 0);
ret = -EINVAL;
@@ -860,7 +855,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
- extent_buffer_get(root->node);
btrfs_set_header_nritems(root->node, 0);
btrfs_set_header_level(root->node, 0);
BTRFS_I(inode)->root = root;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d1eeef9..127fa15 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -233,14 +233,12 @@ loop:
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
- init_waitqueue_head(&cur_trans->pending_wait);
cur_trans->state = TRANS_STATE_RUNNING;
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
refcount_set(&cur_trans->use_count, 2);
- atomic_set(&cur_trans->pending_ordered, 0);
cur_trans->flags = 0;
cur_trans->start_time = ktime_get_seconds();
@@ -456,7 +454,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
bool enforce_qgroups)
{
struct btrfs_fs_info *fs_info = root->fs_info;
-
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
u64 num_bytes = 0;
@@ -485,13 +483,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
* the appropriate flushing if need be.
*/
if (num_items && root != fs_info->chunk_root) {
+ struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
+ u64 delayed_refs_bytes = 0;
+
qgroup_reserved = num_items * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
enforce_qgroups);
if (ret)
return ERR_PTR(ret);
+ /*
+ * We want to reserve all the bytes we may need all at once, so
+ * we only do 1 enospc flushing cycle per transaction start. We
+ * accomplish this by simply assuming we'll do 2 x num_items
+ * worth of delayed refs updates in this trans handle, and
+ * refill that amount for whatever is missing in the reserve.
+ */
num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
+ if (delayed_refs_rsv->full == 0) {
+ delayed_refs_bytes = num_bytes;
+ num_bytes <<= 1;
+ }
+
/*
* Do the reservation for the relocation root creation
*/
@@ -500,8 +513,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
reloc_reserved = true;
}
- ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
- num_bytes, flush);
+ ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
+ if (ret)
+ goto reserve_fail;
+ if (delayed_refs_bytes) {
+ btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
+ delayed_refs_bytes);
+ num_bytes -= delayed_refs_bytes;
+ }
+ } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
+ !delayed_refs_rsv->full) {
+ /*
+ * Some people call with btrfs_start_transaction(root, 0)
+ * because they can be throttled, but have some other mechanism
+ * for reserving space. We still want these guys to refill the
+ * delayed block_rsv so just add 1 items worth of reservation
+ * here.
+ */
+ ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
if (ret)
goto reserve_fail;
}
@@ -670,7 +699,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
/*
* btrfs_attach_transaction_barrier() - catch the running transaction
*
- * It is similar to the above function, the differentia is this one
+ * It is similar to the above function, the difference is this one
* will wait for all the inactive transactions until they fully
* complete.
*/
@@ -760,7 +789,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- if (btrfs_check_space_for_delayed_refs(trans))
+ if (btrfs_check_space_for_delayed_refs(fs_info))
return 1;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
@@ -769,22 +798,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
- int updates;
- int err;
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED ||
cur_trans->delayed_refs.flushing)
return 1;
- updates = trans->delayed_ref_updates;
- trans->delayed_ref_updates = 0;
- if (updates) {
- err = btrfs_run_delayed_refs(trans, updates * 2);
- if (err) /* Error code will also eval true */
- return err;
- }
-
return should_end_transaction(trans);
}
@@ -814,11 +833,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- u64 transid = trans->transid;
- unsigned long cur = trans->delayed_ref_updates;
int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
- int must_run_delayed_refs = 0;
if (refcount_read(&trans->use_count) > 1) {
refcount_dec(&trans->use_count);
@@ -832,27 +848,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans);
- trans->delayed_ref_updates = 0;
- if (!trans->sync) {
- must_run_delayed_refs =
- btrfs_should_throttle_delayed_refs(trans);
- cur = max_t(unsigned long, cur, 32);
-
- /*
- * don't make the caller wait if they are from a NOLOCK
- * or ATTACH transaction, it will deadlock with commit
- */
- if (must_run_delayed_refs == 1 &&
- (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
- must_run_delayed_refs = 2;
- }
-
- btrfs_trans_release_metadata(trans);
- trans->block_rsv = NULL;
-
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
-
btrfs_trans_release_chunk_metadata(trans);
if (lock && should_end_transaction(trans) &&
@@ -894,10 +889,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- if (must_run_delayed_refs) {
- btrfs_async_run_delayed_refs(info, cur, transid,
- must_run_delayed_refs == 1);
- }
return err;
}
@@ -1338,7 +1329,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
return 0;
/*
- * Ensure dirty @src will be commited. Or, after comming
+ * Ensure dirty @src will be committed. Or, after coming
* commit_fs_roots() and switch_commit_roots(), any dirty but not
* recorded root will never be updated again, causing an outdated root
* item.
@@ -1842,7 +1833,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- DEFINE_WAIT(wait);
WARN_ON(refcount_read(&trans->use_count) > 1);
@@ -1911,13 +1901,6 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
-static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
-{
- wait_event(cur_trans->pending_wait,
- atomic_read(&cur_trans->pending_ordered) == 0);
-}
-
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2052,8 +2035,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_wait_delalloc_flush(fs_info);
- btrfs_wait_pending_ordered(cur_trans);
-
btrfs_scrub_pause(fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 4cbb1b5..f1ba789 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -12,13 +12,13 @@
#include "ctree.h"
enum btrfs_trans_state {
- TRANS_STATE_RUNNING = 0,
- TRANS_STATE_BLOCKED = 1,
- TRANS_STATE_COMMIT_START = 2,
- TRANS_STATE_COMMIT_DOING = 3,
- TRANS_STATE_UNBLOCKED = 4,
- TRANS_STATE_COMPLETED = 5,
- TRANS_STATE_MAX = 6,
+ TRANS_STATE_RUNNING,
+ TRANS_STATE_BLOCKED,
+ TRANS_STATE_COMMIT_START,
+ TRANS_STATE_COMMIT_DOING,
+ TRANS_STATE_UNBLOCKED,
+ TRANS_STATE_COMPLETED,
+ TRANS_STATE_MAX,
};
#define BTRFS_TRANS_HAVE_FREE_BGS 0
@@ -39,7 +39,6 @@ struct btrfs_transaction {
*/
atomic_t num_writers;
refcount_t use_count;
- atomic_t pending_ordered;
unsigned long flags;
@@ -51,7 +50,6 @@ struct btrfs_transaction {
time64_t start_time;
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
- wait_queue_head_t pending_wait;
struct list_head pending_snapshots;
struct list_head pending_chunks;
struct list_head switch_commits;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index efcf89a8..a62e1e8 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -27,10 +27,10 @@
*
* @type: leaf or node
* @identifier: the necessary info to locate the leaf/node.
- * It's recommened to decode key.objecitd/offset if it's
+ * It's recommended to decode key.objecitd/offset if it's
* meaningful.
* @reason: describe the error
- * @bad_value: optional, it's recommened to output bad value and its
+ * @bad_value: optional, it's recommended to output bad value and its
* expected value (range).
*
* Since comma is used to separate the components, only space is allowed
@@ -130,7 +130,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info,
}
/*
- * Support for new compression/encrption must introduce incompat flag,
+ * Support for new compression/encryption must introduce incompat flag,
* and must be caught in open_ctree().
*/
if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
@@ -389,13 +389,11 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info,
/*
* Here we don't really care about alignment since extent allocator can
- * handle it. We care more about the size, as if one block group is
- * larger than maximum size, it's must be some obvious corruption.
+ * handle it. We care more about the size.
*/
- if (key->offset > BTRFS_MAX_DATA_CHUNK_SIZE || key->offset == 0) {
+ if (key->offset == 0) {
block_group_err(fs_info, leaf, slot,
- "invalid block group size, have %llu expect (0, %llu]",
- key->offset, BTRFS_MAX_DATA_CHUNK_SIZE);
+ "invalid block group size 0");
return -EUCLEAN;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a5ce99a..ac232b3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1144,7 +1144,7 @@ next:
}
btrfs_release_path(path);
- /* look for a conflicing name */
+ /* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
if (di && !IS_ERR(di)) {
@@ -3149,7 +3149,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
/*
- * nobody else is going to jump in and write the the ctree
+ * Nobody else is going to jump in and write the ctree
* super here because the log_commit atomic below is protecting
* us. We must be called with a transaction handle pinning
* the running transaction open, so a full commit can't hop
@@ -3201,8 +3201,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *log)
{
int ret;
- u64 start;
- u64 end;
struct walk_control wc = {
.free = 1,
.process_func = process_one_buffer
@@ -3216,18 +3214,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
btrfs_handle_fs_error(log->fs_info, ret, NULL);
}
- while (1) {
- ret = find_first_extent_bit(&log->dirty_log_pages,
- 0, &start, &end,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
- NULL);
- if (ret)
- break;
-
- clear_extent_bits(&log->dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
- }
-
+ clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
free_extent_buffer(log->node);
kfree(log);
}
@@ -4383,7 +4371,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
- u64 logged_start, logged_end;
u64 test_gen;
int ret = 0;
int num = 0;
@@ -4392,8 +4379,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
- logged_start = start;
- logged_end = end;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
/*
@@ -4434,11 +4419,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
em->start >= i_size_read(&inode->vfs_inode))
continue;
- if (em->start < logged_start)
- logged_start = em->start;
- if ((em->start + em->len - 1) > logged_end)
- logged_end = em->start + em->len - 1;
-
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
set_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -5778,6 +5758,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
+ /*
+ * If a new hard link was added to the inode in the current transaction
+ * and its link count is now greater than 1, we need to fallback to a
+ * transaction commit, otherwise we can end up not logging all its new
+ * parents for all the hard links. Here just from the dentry used to
+ * fsync, we can not visit the ancestor inodes for all the other hard
+ * links to figure out if any is new, so we fallback to a transaction
+ * commit (instead of adding a lot of complexity of scanning a btree,
+ * since this scenario is not a common use case).
+ */
+ if (inode->vfs_inode.i_nlink > 1 &&
+ inode->last_link_trans > last_committed) {
+ ret = -EMLINK;
+ goto end_trans;
+ }
+
while (1) {
if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
break;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7677650..0fab84a 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -15,7 +15,6 @@
struct btrfs_log_ctx {
int log_ret;
int log_transid;
- int io_err;
bool log_new_dentries;
struct inode *inode;
struct list_head list;
@@ -26,7 +25,6 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
{
ctx->log_ret = 0;
ctx->log_transid = 0;
- ctx->io_err = 0;
ctx->log_new_dentries = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d39..2576b1a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -37,6 +37,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
+ .nparity = 0,
.raid_name = "raid10",
.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
@@ -49,6 +50,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
+ .nparity = 0,
.raid_name = "raid1",
.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
@@ -61,6 +63,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 2,
+ .nparity = 0,
.raid_name = "dup",
.bg_flag = BTRFS_BLOCK_GROUP_DUP,
.mindev_error = 0,
@@ -73,6 +76,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
+ .nparity = 0,
.raid_name = "raid0",
.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
.mindev_error = 0,
@@ -85,6 +89,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
+ .nparity = 0,
.raid_name = "single",
.bg_flag = 0,
.mindev_error = 0,
@@ -96,7 +101,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 1,
- .ncopies = 2,
+ .ncopies = 1,
+ .nparity = 1,
.raid_name = "raid5",
.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
@@ -108,7 +114,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.devs_min = 3,
.tolerated_failures = 2,
.devs_increment = 1,
- .ncopies = 3,
+ .ncopies = 1,
+ .nparity = 2,
.raid_name = "raid6",
.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
@@ -123,6 +130,60 @@ const char *get_raid_name(enum btrfs_raid_types type)
return btrfs_raid_array[type].raid_name;
}
+/*
+ * Fill @buf with textual description of @bg_flags, no more than @size_buf
+ * bytes including terminating null byte.
+ */
+void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
+{
+ int i;
+ int ret;
+ char *bp = buf;
+ u64 flags = bg_flags;
+ u32 size_bp = size_buf;
+
+ if (!flags) {
+ strcpy(bp, "NONE");
+ return;
+ }
+
+#define DESCRIBE_FLAG(flag, desc) \
+ do { \
+ if (flags & (flag)) { \
+ ret = snprintf(bp, size_bp, "%s|", (desc)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ flags &= ~(flag); \
+ } \
+ } while (0)
+
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+
+ DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
+ btrfs_raid_array[i].raid_name);
+#undef DESCRIBE_FLAG
+
+ if (flags) {
+ ret = snprintf(bp, size_bp, "0x%llx|", flags);
+ size_bp -= ret;
+ }
+
+ if (size_bp < size_buf)
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
+
+ /*
+ * The text is trimmed, it's up to the caller to provide sufficiently
+ * large buffer
+ */
+out_overflow:;
+}
+
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
@@ -151,7 +212,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* the mutex can be very coarse and can cover long-running operations
*
* protects: updates to fs_devices counters like missing devices, rw devices,
- * seeding, structure cloning, openning/closing devices at mount/umount time
+ * seeding, structure cloning, opening/closing devices at mount/umount time
*
* global::fs_devs - add, remove, updates to the global list
*
@@ -238,13 +299,15 @@ struct list_head *btrfs_get_fs_uuids(void)
/*
* alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid: if not NULL, copy the uuid to fs_devices::fsid
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid
+ * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
*
* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
* The returned struct is not linked onto any lists and can be destroyed with
* kfree() right away.
*/
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
+ const u8 *metadata_fsid)
{
struct btrfs_fs_devices *fs_devs;
@@ -261,6 +324,11 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+ if (metadata_fsid)
+ memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
+ else if (fsid)
+ memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
+
return fs_devs;
}
@@ -368,13 +436,57 @@ static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
return NULL;
}
-static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
+static noinline struct btrfs_fs_devices *find_fsid(
+ const u8 *fsid, const u8 *metadata_fsid)
{
struct btrfs_fs_devices *fs_devices;
+ ASSERT(fsid);
+
+ if (metadata_fsid) {
+ /*
+ * Handle scanned device having completed its fsid change but
+ * belonging to a fs_devices that was created by first scanning
+ * a device which didn't have its fsid/metadata_uuid changed
+ * at all and the CHANGING_FSID_V2 flag set.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (fs_devices->fsid_change &&
+ memcmp(metadata_fsid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) == 0 &&
+ memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0) {
+ return fs_devices;
+ }
+ }
+ /*
+ * Handle scanned device having completed its fsid change but
+ * belonging to a fs_devices that was created by a device that
+ * has an outdated pair of fsid/metadata_uuid and
+ * CHANGING_FSID_V2 flag set.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (fs_devices->fsid_change &&
+ memcmp(fs_devices->metadata_uuid,
+ fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
+ memcmp(metadata_fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0) {
+ return fs_devices;
+ }
+ }
+ }
+
+ /* Handle non-split brain cases */
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
- return fs_devices;
+ if (metadata_fsid) {
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
+ && memcmp(metadata_fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ } else {
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ }
}
return NULL;
}
@@ -709,6 +821,13 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
device->generation = btrfs_super_generation(disk_super);
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+ if (btrfs_super_incompat_flags(disk_super) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
+ pr_err(
+ "BTRFS: Invalid seeding and uuid-changed device detected\n");
+ goto error_brelse;
+ }
+
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = 1;
} else {
@@ -744,6 +863,51 @@ error_brelse:
}
/*
+ * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
+ * being created with a disk that has already completed its fsid change.
+ */
+static struct btrfs_fs_devices *find_fsid_inprogress(
+ struct btrfs_super_block *disk_super)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) != 0 &&
+ memcmp(fs_devices->metadata_uuid, disk_super->fsid,
+ BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
+ return fs_devices;
+ }
+ }
+
+ return NULL;
+}
+
+
+static struct btrfs_fs_devices *find_fsid_changed(
+ struct btrfs_super_block *disk_super)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Handles the case where scanned device is part of an fs that had
+ * multiple successful changes of FSID but curently device didn't
+ * observe it. Meaning our fsid will be different than theirs.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) != 0 &&
+ memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0 &&
+ memcmp(fs_devices->fsid, disk_super->fsid,
+ BTRFS_FSID_SIZE) != 0) {
+ return fs_devices;
+ }
+ }
+
+ return NULL;
+}
+/*
* Add new device to list of registered devices
*
* Returns:
@@ -755,14 +919,46 @@ static noinline struct btrfs_device *device_list_add(const char *path,
bool *new_device_added)
{
struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices;
+ struct btrfs_fs_devices *fs_devices = NULL;
struct rcu_string *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+ bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+ bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
+ BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+
+ if (fsid_change_in_progress) {
+ if (!has_metadata_uuid) {
+ /*
+ * When we have an image which has CHANGING_FSID_V2 set
+ * it might belong to either a filesystem which has
+ * disks with completed fsid change or it might belong
+ * to fs with no UUID changes in effect, handle both.
+ */
+ fs_devices = find_fsid_inprogress(disk_super);
+ if (!fs_devices)
+ fs_devices = find_fsid(disk_super->fsid, NULL);
+ } else {
+ fs_devices = find_fsid_changed(disk_super);
+ }
+ } else if (has_metadata_uuid) {
+ fs_devices = find_fsid(disk_super->fsid,
+ disk_super->metadata_uuid);
+ } else {
+ fs_devices = find_fsid(disk_super->fsid, NULL);
+ }
+
- fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
- fs_devices = alloc_fs_devices(disk_super->fsid);
+ if (has_metadata_uuid)
+ fs_devices = alloc_fs_devices(disk_super->fsid,
+ disk_super->metadata_uuid);
+ else
+ fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
+
+ fs_devices->fsid_change = fsid_change_in_progress;
+
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
@@ -774,6 +970,21 @@ static noinline struct btrfs_device *device_list_add(const char *path,
mutex_lock(&fs_devices->device_list_mutex);
device = find_device(fs_devices, devid,
disk_super->dev_item.uuid);
+
+ /*
+ * If this disk has been pulled into an fs devices created by
+ * a device which had the CHANGING_FSID_V2 flag then replace the
+ * metadata_uuid/fsid values of the fs_devices.
+ */
+ if (has_metadata_uuid && fs_devices->fsid_change &&
+ found_transid > fs_devices->latest_generation) {
+ memcpy(fs_devices->fsid, disk_super->fsid,
+ BTRFS_FSID_SIZE);
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+
+ fs_devices->fsid_change = false;
+ }
}
if (!device) {
@@ -850,6 +1061,35 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(-EEXIST);
}
+ /*
+ * We are going to replace the device path for a given devid,
+ * make sure it's the same device if the device is mounted
+ */
+ if (device->bdev) {
+ struct block_device *path_bdev;
+
+ path_bdev = lookup_bdev(path);
+ if (IS_ERR(path_bdev)) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return ERR_CAST(path_bdev);
+ }
+
+ if (device->bdev != path_bdev) {
+ bdput(path_bdev);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_warn_in_rcu(device->fs_info,
+ "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
+ disk_super->fsid, devid,
+ rcu_str_deref(device->name), path);
+ return ERR_PTR(-EEXIST);
+ }
+ bdput(path_bdev);
+ btrfs_info_in_rcu(device->fs_info,
+ "device fsid %pU devid %llu moved old:%s new:%s",
+ disk_super->fsid, devid,
+ rcu_str_deref(device->name), path);
+ }
+
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
mutex_unlock(&fs_devices->device_list_mutex);
@@ -869,8 +1109,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
* it back. We need it to pick the disk with largest generation
* (as above).
*/
- if (!fs_devices->opened)
+ if (!fs_devices->opened) {
device->generation = found_transid;
+ fs_devices->latest_generation = max_t(u64, found_transid,
+ fs_devices->latest_generation);
+ }
fs_devices->total_devices = btrfs_super_num_devices(disk_super);
@@ -884,7 +1127,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
struct btrfs_device *device;
struct btrfs_device *orig_dev;
- fs_devices = alloc_fs_devices(orig->fsid);
+ fs_devices = alloc_fs_devices(orig->fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -1193,7 +1436,7 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
p = kmap(*page);
/* align our pointer to the offset of the super block */
- *disk_super = p + (bytenr & ~PAGE_MASK);
+ *disk_super = p + offset_in_page(bytenr);
if (btrfs_super_bytenr(*disk_super) != bytenr ||
btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
@@ -1709,7 +1952,8 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
ptr = btrfs_device_uuid(dev_item);
write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
ptr = btrfs_device_fsid(dev_item);
- write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
+ write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
+ ptr, BTRFS_FSID_SIZE);
btrfs_mark_buffer_dirty(leaf);
ret = 0;
@@ -1862,12 +2106,12 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
{
u64 num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+ down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
ASSERT(num_devices > 1);
num_devices--;
}
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
return num_devices;
}
@@ -1900,6 +2144,14 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
goto out;
}
+ if (btrfs_pinned_by_swapfile(fs_info, device)) {
+ btrfs_warn_in_rcu(fs_info,
+ "cannot remove device %s (devid %llu) due to active swapfile",
+ rcu_str_deref(device->name), device->devid);
+ ret = -ETXTBSY;
+ goto out;
+ }
+
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto out;
@@ -2132,7 +2384,13 @@ static struct btrfs_device *btrfs_find_device_by_path(
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
- device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+ device = btrfs_find_device(fs_info, devid, dev_uuid,
+ disk_super->metadata_uuid);
+ else
+ device = btrfs_find_device(fs_info, devid,
+ dev_uuid, disk_super->fsid);
+
brelse(bh);
if (!device)
device = ERR_PTR(-ENOENT);
@@ -2202,7 +2460,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
if (!fs_devices->seeding)
return -EINVAL;
- seed_devices = alloc_fs_devices(NULL);
+ seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
@@ -2238,7 +2496,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
- memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
mutex_unlock(&fs_devices->device_list_mutex);
@@ -2480,7 +2738,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
* so rename the fsid on the sysfs
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
- fs_info->fsid);
+ fs_info->fs_devices->fsid);
if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
btrfs_warn(fs_info,
"sysfs: failed to create fsid for sprout");
@@ -2718,8 +2976,15 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
return ret;
}
-static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length)
+/*
+ * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * @logical: Logical block offset in bytes.
+ * @length: Length of extent in bytes.
+ *
+ * Return: Chunk mapping or ERR_PTR.
+ */
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
@@ -2756,7 +3021,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em = get_chunk_map(fs_info, chunk_offset, 1);
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em)) {
/*
* This is a logic error, but we don't want to just rely on the
@@ -2797,13 +3062,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
mutex_unlock(&fs_info->chunk_mutex);
}
- if (map->stripes[i].dev) {
- ret = btrfs_update_device(trans, map->stripes[i].dev);
- if (ret) {
- mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
+ ret = btrfs_update_device(trans, device);
+ if (ret) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
}
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -3437,17 +3700,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
struct btrfs_root *chunk_root = fs_info->chunk_root;
- struct btrfs_root *dev_root = fs_info->dev_root;
- struct list_head *devices;
- struct btrfs_device *device;
- u64 old_size;
- u64 size_to_free;
u64 chunk_type;
struct btrfs_chunk *chunk;
struct btrfs_path *path = NULL;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_trans_handle *trans;
struct extent_buffer *leaf;
int slot;
int ret;
@@ -3462,53 +3719,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u32 count_sys = 0;
int chunk_reserved = 0;
- /* step one make some room on all the devices */
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
- old_size = btrfs_device_get_total_bytes(device);
- size_to_free = div_factor(old_size, 1);
- size_to_free = min_t(u64, size_to_free, SZ_1M);
- if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
- btrfs_device_get_total_bytes(device) -
- btrfs_device_get_bytes_used(device) > size_to_free ||
- test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
- continue;
-
- ret = btrfs_shrink_device(device, old_size - size_to_free);
- if (ret == -ENOSPC)
- break;
- if (ret) {
- /* btrfs_shrink_device never returns ret > 0 */
- WARN_ON(ret > 0);
- goto error;
- }
-
- trans = btrfs_start_transaction(dev_root, 0);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- btrfs_info_in_rcu(fs_info,
- "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
- rcu_str_deref(device->name), ret,
- old_size, old_size - size_to_free);
- goto error;
- }
-
- ret = btrfs_grow_device(trans, device, old_size);
- if (ret) {
- btrfs_end_transaction(trans);
- /* btrfs_grow_device never returns ret > 0 */
- WARN_ON(ret > 0);
- btrfs_info_in_rcu(fs_info,
- "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
- rcu_str_deref(device->name), ret,
- old_size, old_size - size_to_free);
- goto error;
- }
-
- btrfs_end_transaction(trans);
- }
-
- /* step two, relocate all the chunks */
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -3638,10 +3848,15 @@ again:
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- if (ret && ret != -ENOSPC)
- goto error;
if (ret == -ENOSPC) {
enospc_errors++;
+ } else if (ret == -ETXTBSY) {
+ btrfs_info(fs_info,
+ "skipping relocation of block group %llu due to active swapfile",
+ found_key.offset);
+ ret = 0;
+ } else if (ret) {
+ goto error;
} else {
spin_lock(&fs_info->balance_lock);
bctl->stat.completed++;
@@ -3712,6 +3927,162 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
}
/*
+ * Fill @buf with textual description of balance filter flags @bargs, up to
+ * @size_buf including the terminating null. The output may be trimmed if it
+ * does not fit into the provided buffer.
+ */
+static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
+ u32 size_buf)
+{
+ int ret;
+ u32 size_bp = size_buf;
+ char *bp = buf;
+ u64 flags = bargs->flags;
+ char tmp_buf[128] = {'\0'};
+
+ if (!flags)
+ return;
+
+#define CHECK_APPEND_NOARG(a) \
+ do { \
+ ret = snprintf(bp, size_bp, (a)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+#define CHECK_APPEND_1ARG(a, v1) \
+ do { \
+ ret = snprintf(bp, size_bp, (a), (v1)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+#define CHECK_APPEND_2ARG(a, v1, v2) \
+ do { \
+ ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+ if (flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ int index = btrfs_bg_flags_to_raid_index(bargs->target);
+
+ CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index));
+ }
+
+ if (flags & BTRFS_BALANCE_ARGS_SOFT)
+ CHECK_APPEND_NOARG("soft,");
+
+ if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
+ btrfs_describe_block_groups(bargs->profiles, tmp_buf,
+ sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
+ }
+
+ if (flags & BTRFS_BALANCE_ARGS_USAGE)
+ CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
+
+ if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
+ CHECK_APPEND_2ARG("usage=%u..%u,",
+ bargs->usage_min, bargs->usage_max);
+
+ if (flags & BTRFS_BALANCE_ARGS_DEVID)
+ CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
+
+ if (flags & BTRFS_BALANCE_ARGS_DRANGE)
+ CHECK_APPEND_2ARG("drange=%llu..%llu,",
+ bargs->pstart, bargs->pend);
+
+ if (flags & BTRFS_BALANCE_ARGS_VRANGE)
+ CHECK_APPEND_2ARG("vrange=%llu..%llu,",
+ bargs->vstart, bargs->vend);
+
+ if (flags & BTRFS_BALANCE_ARGS_LIMIT)
+ CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
+
+ if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
+ CHECK_APPEND_2ARG("limit=%u..%u,",
+ bargs->limit_min, bargs->limit_max);
+
+ if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
+ CHECK_APPEND_2ARG("stripes=%u..%u,",
+ bargs->stripes_min, bargs->stripes_max);
+
+#undef CHECK_APPEND_2ARG
+#undef CHECK_APPEND_1ARG
+#undef CHECK_APPEND_NOARG
+
+out_overflow:
+
+ if (size_bp < size_buf)
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
+ else
+ buf[0] = '\0';
+}
+
+static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
+{
+ u32 size_buf = 1024;
+ char tmp_buf[192] = {'\0'};
+ char *buf;
+ char *bp;
+ u32 size_bp = size_buf;
+ int ret;
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ buf = kzalloc(size_buf, GFP_KERNEL);
+ if (!buf)
+ return;
+
+ bp = buf;
+
+#define CHECK_APPEND_1ARG(a, v1) \
+ do { \
+ ret = snprintf(bp, size_bp, (a), (v1)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+ if (bctl->flags & BTRFS_BALANCE_FORCE)
+ CHECK_APPEND_1ARG("%s", "-f ");
+
+ if (bctl->flags & BTRFS_BALANCE_DATA) {
+ describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("-d%s ", tmp_buf);
+ }
+
+ if (bctl->flags & BTRFS_BALANCE_METADATA) {
+ describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("-m%s ", tmp_buf);
+ }
+
+ if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
+ describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("-s%s ", tmp_buf);
+ }
+
+#undef CHECK_APPEND_1ARG
+
+out_overflow:
+
+ if (size_bp < size_buf)
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
+ btrfs_info(fs_info, "balance: %s %s",
+ (bctl->flags & BTRFS_BALANCE_RESUME) ?
+ "resume" : "start", buf);
+
+ kfree(buf);
+}
+
+/*
* Should be called with balance mutexe held
*/
int btrfs_balance(struct btrfs_fs_info *fs_info,
@@ -3724,6 +4095,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
int ret;
u64 num_devices;
unsigned seq;
+ bool reducing_integrity;
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
@@ -3803,24 +4175,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
!(bctl->sys.target & allowed)) ||
((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(fs_info->avail_metadata_alloc_bits & allowed) &&
- !(bctl->meta.target & allowed))) {
- if (bctl->flags & BTRFS_BALANCE_FORCE) {
- btrfs_info(fs_info,
- "balance: force reducing metadata integrity");
- } else {
- btrfs_err(fs_info,
- "balance: reduces metadata integrity, use --force if you want this");
- ret = -EINVAL;
- goto out;
- }
- }
+ !(bctl->meta.target & allowed)))
+ reducing_integrity = true;
+ else
+ reducing_integrity = false;
+
+ /* if we're not converting, the target field is uninitialized */
+ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->meta.target : fs_info->avail_metadata_alloc_bits;
+ data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->data.target : fs_info->avail_data_alloc_bits;
} while (read_seqretry(&fs_info->profiles_lock, seq));
- /* if we're not converting, the target field is uninitialized */
- meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
- bctl->meta.target : fs_info->avail_metadata_alloc_bits;
- data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
- bctl->data.target : fs_info->avail_data_alloc_bits;
+ if (reducing_integrity) {
+ if (bctl->flags & BTRFS_BALANCE_FORCE) {
+ btrfs_info(fs_info,
+ "balance: force reducing metadata integrity");
+ } else {
+ btrfs_err(fs_info,
+ "balance: reduces metadata integrity, use --force if you want this");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
@@ -3850,11 +4228,19 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
+ describe_balance_start_or_resume(fs_info);
mutex_unlock(&fs_info->balance_mutex);
ret = __btrfs_balance(fs_info);
mutex_lock(&fs_info->balance_mutex);
+ if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
+ btrfs_info(fs_info, "balance: paused");
+ else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req))
+ btrfs_info(fs_info, "balance: canceled");
+ else
+ btrfs_info(fs_info, "balance: ended with status: %d", ret);
+
clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
if (bargs) {
@@ -3887,10 +4273,8 @@ static int balance_kthread(void *data)
int ret = 0;
mutex_lock(&fs_info->balance_mutex);
- if (fs_info->balance_ctl) {
- btrfs_info(fs_info, "balance: resuming");
+ if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
- }
mutex_unlock(&fs_info->balance_mutex);
return ret;
@@ -4433,10 +4817,16 @@ again:
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- if (ret && ret != -ENOSPC)
- goto done;
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
failed++;
+ } else if (ret) {
+ if (ret == -ETXTBSY) {
+ btrfs_warn(fs_info,
+ "could not shrink block group %llu due to active swapfile",
+ chunk_offset);
+ }
+ goto done;
+ }
} while (key.offset-- > 0);
if (failed && !retried) {
@@ -4602,11 +4992,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
int devs_min; /* min devs needed */
int devs_increment; /* ndevs has to be a multiple of this */
int ncopies; /* how many copies to data has */
+ int nparity; /* number of stripes worth of bytes to
+ store parity information */
int ret;
u64 max_stripe_size;
u64 max_chunk_size;
u64 stripe_size;
- u64 num_bytes;
+ u64 chunk_size;
int ndevs;
int i;
int j;
@@ -4628,6 +5020,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
devs_min = btrfs_raid_array[index].devs_min;
devs_increment = btrfs_raid_array[index].devs_increment;
ncopies = btrfs_raid_array[index].ncopies;
+ nparity = btrfs_raid_array[index].nparity;
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = SZ_1G;
@@ -4654,7 +5047,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
BUG_ON(1);
}
- /* we don't want a chunk larger than 10% of writeable space */
+ /* We don't want a chunk larger than 10% of writable space */
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
@@ -4757,30 +5150,22 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
* this will have to be fixed for RAID1 and RAID10 over
* more drives
*/
- data_stripes = num_stripes / ncopies;
-
- if (type & BTRFS_BLOCK_GROUP_RAID5)
- data_stripes = num_stripes - 1;
-
- if (type & BTRFS_BLOCK_GROUP_RAID6)
- data_stripes = num_stripes - 2;
+ data_stripes = (num_stripes - nparity) / ncopies;
/*
* Use the number of data stripes to figure out how big this chunk
* is really going to be in terms of logical address space,
- * and compare that answer with the max chunk size
+ * and compare that answer with the max chunk size. If it's higher,
+ * we try to reduce stripe_size.
*/
if (stripe_size * data_stripes > max_chunk_size) {
- stripe_size = div_u64(max_chunk_size, data_stripes);
-
- /* bump the answer up to a 16MB boundary */
- stripe_size = round_up(stripe_size, SZ_16M);
-
/*
- * But don't go higher than the limits we found while searching
- * for free extents
+ * Reduce stripe_size, round it up to a 16MB boundary again and
+ * then use it, unless it ends up being even bigger than the
+ * previous value we had already.
*/
- stripe_size = min(devices_info[ndevs - 1].max_avail,
+ stripe_size = min(round_up(div_u64(max_chunk_size,
+ data_stripes), SZ_16M),
stripe_size);
}
@@ -4808,9 +5193,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
map->type = type;
map->sub_stripes = sub_stripes;
- num_bytes = stripe_size * data_stripes;
+ chunk_size = stripe_size * data_stripes;
- trace_btrfs_chunk_alloc(info, map, start, num_bytes);
+ trace_btrfs_chunk_alloc(info, map, start, chunk_size);
em = alloc_extent_map();
if (!em) {
@@ -4821,7 +5206,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->map_lookup = map;
em->start = start;
- em->len = num_bytes;
+ em->len = chunk_size;
em->block_start = 0;
em->block_len = em->len;
em->orig_block_len = stripe_size;
@@ -4839,14 +5224,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
refcount_inc(&em->refs);
write_unlock(&em_tree->lock);
- ret = btrfs_make_block_group(trans, 0, type, start, num_bytes);
+ ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
if (ret)
goto error_del_extent;
- for (i = 0; i < map->num_stripes; i++) {
- num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
- btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
- }
+ for (i = 0; i < map->num_stripes; i++)
+ btrfs_device_set_bytes_used(map->stripes[i].dev,
+ map->stripes[i].dev->bytes_used + stripe_size);
atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
@@ -4890,7 +5274,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int i = 0;
int ret = 0;
- em = get_chunk_map(fs_info, chunk_offset, chunk_size);
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -4971,10 +5355,10 @@ out:
}
/*
- * Chunk allocation falls into two parts. The first part does works
- * that make the new allocated chunk useable, but not do any operation
- * that modifies the chunk tree. The second part does the works that
- * require modifying the chunk tree. This division is important for the
+ * Chunk allocation falls into two parts. The first part does work
+ * that makes the new allocated chunk usable, but does not do any operation
+ * that modifies the chunk tree. The second part does the work that
+ * requires modifying the chunk tree. This division is important for the
* bootstrap process of adding storage to a seed btrfs.
*/
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
@@ -5032,7 +5416,7 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
int miss_ndevs = 0;
int i;
- em = get_chunk_map(fs_info, chunk_offset, 1);
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em))
return 1;
@@ -5092,7 +5476,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
struct map_lookup *map;
int ret;
- em = get_chunk_map(fs_info, logical, len);
+ em = btrfs_get_chunk_map(fs_info, logical, len);
if (IS_ERR(em))
/*
* We could return errors for these cases, but that could get
@@ -5122,11 +5506,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
ret = 1;
free_extent_map(em);
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+ down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
fs_info->dev_replace.tgtdev)
ret++;
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ up_read(&fs_info->dev_replace.rwsem);
return ret;
}
@@ -5138,7 +5522,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct map_lookup *map;
unsigned long len = fs_info->sectorsize;
- em = get_chunk_map(fs_info, logical, len);
+ em = btrfs_get_chunk_map(fs_info, logical, len);
if (!WARN_ON(IS_ERR(em))) {
map = em->map_lookup;
@@ -5155,7 +5539,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
struct map_lookup *map;
int ret = 0;
- em = get_chunk_map(fs_info, logical, len);
+ em = btrfs_get_chunk_map(fs_info, logical, len);
if(!WARN_ON(IS_ERR(em))) {
map = em->map_lookup;
@@ -5314,7 +5698,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
/* discard always return a bbio */
ASSERT(bbio_ret);
- em = get_chunk_map(fs_info, logical, length);
+ em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -5640,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
return __btrfs_map_block_for_discard(fs_info, logical,
*length, bbio_ret);
- em = get_chunk_map(fs_info, logical, *length);
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -5699,17 +6083,21 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*length = em->len - offset;
}
- /* This is for when we're called from btrfs_merge_bio_hook() and all
- it cares about is the length */
+ /*
+ * This is for when we're called from btrfs_bio_fits_in_stripe and all
+ * it cares about is the length
+ */
if (!bbio_ret)
goto out;
- btrfs_dev_replace_read_lock(dev_replace);
+ down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ /*
+ * Hold the semaphore for read during the whole operation, write is
+ * requested at commit time but must wait.
+ */
if (!dev_replace_is_ongoing)
- btrfs_dev_replace_read_unlock(dev_replace);
- else
- btrfs_dev_replace_set_lock_blocking(dev_replace);
+ up_read(&dev_replace->rwsem);
if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
!need_full_stripe(op) && dev_replace->tgtdev != NULL) {
@@ -5904,12 +6292,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
out:
if (dev_replace_is_ongoing) {
- ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
- btrfs_dev_replace_read_lock(dev_replace);
- /* Barrier implied by atomic_dec_and_test */
- if (atomic_dec_and_test(&dev_replace->blocking_readers))
- cond_wake_up_nomb(&dev_replace->read_lock_wq);
- btrfs_dev_replace_read_unlock(dev_replace);
+ lockdep_assert_held(&dev_replace->rwsem);
+ /* Unlock and let waiting writers proceed */
+ up_read(&dev_replace->rwsem);
}
free_extent_map(em);
return ret;
@@ -5943,7 +6328,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 rmap_len;
int i, j, nr = 0;
- em = get_chunk_map(fs_info, chunk_start, 1);
+ em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
if (IS_ERR(em))
return -EIO;
@@ -6083,12 +6468,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
- !device->bdev) {
- bio_io_error(bio);
- return;
- }
-
/* don't bother with additional async steps for reads, right now */
if (bio_op(bio) == REQ_OP_READ) {
btrfsic_submit_bio(bio);
@@ -6217,7 +6596,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
dev = bbio->stripes[dev_nr].dev;
- if (!dev || !dev->bdev ||
+ if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
+ &dev->dev_state) ||
(bio_op(first_bio) == REQ_OP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
bbio_error(bbio, first_bio, logical);
@@ -6245,7 +6625,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
cur_devices = fs_info->fs_devices;
while (cur_devices) {
if (!fsid ||
- !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
+ !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
device = find_device(cur_devices, devid, uuid);
if (device)
return device;
@@ -6574,12 +6954,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
fs_devices = fs_devices->seed;
}
- fs_devices = find_fsid(fsid);
+ fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
if (!btrfs_test_opt(fs_info, DEGRADED))
return ERR_PTR(-ENOENT);
- fs_devices = alloc_fs_devices(fsid);
+ fs_devices = alloc_fs_devices(fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -6629,7 +7009,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
+ if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
fs_devices = open_seed_devices(fs_info, fs_uuid);
if (IS_ERR(fs_devices))
return PTR_ERR(fs_devices);
@@ -6876,7 +7256,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
if (missing > max_tolerated) {
if (!failing_dev)
btrfs_warn(fs_info,
- "chunk %llu missing %d devices, max tolerance is %d for writeable mount",
+ "chunk %llu missing %d devices, max tolerance is %d for writable mount",
em->start, missing, max_tolerated);
free_extent_map(em);
ret = false;
@@ -7387,6 +7767,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
struct extent_map *em;
struct map_lookup *map;
+ struct btrfs_device *dev;
u64 stripe_len;
bool found = false;
int ret = 0;
@@ -7436,6 +7817,22 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
physical_offset, devid);
ret = -EUCLEAN;
}
+
+ /* Make sure no dev extent is beyond device bondary */
+ dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ if (!dev) {
+ btrfs_err(fs_info, "failed to find devid %llu", devid);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ if (physical_offset + physical_len > dev->disk_total_bytes) {
+ btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
+ devid, physical_offset, physical_len,
+ dev->disk_total_bytes);
+ ret = -EUCLEAN;
+ goto out;
+ }
out:
free_extent_map(em);
return ret;
@@ -7478,6 +7875,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
struct btrfs_path *path;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
+ u64 prev_devid = 0;
+ u64 prev_dev_ext_end = 0;
int ret = 0;
key.objectid = 1;
@@ -7522,10 +7921,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
physical_len = btrfs_dev_extent_length(leaf, dext);
+ /* Check if this dev extent overlaps with the previous one */
+ if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+ btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
+ devid, physical_offset, prev_dev_ext_end);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
physical_offset, physical_len);
if (ret < 0)
goto out;
+ prev_devid = devid;
+ prev_dev_ext_end = physical_offset + physical_len;
+
ret = btrfs_next_item(root, path);
if (ret < 0)
goto out;
@@ -7541,3 +7952,27 @@ out:
btrfs_free_path(path);
return ret;
}
+
+/*
+ * Check whether the given block group or device is pinned by any inode being
+ * used as a swapfile.
+ */
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
+{
+ struct btrfs_swapfile_pin *sp;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->swapfile_pins_lock);
+ node = fs_info->swapfile_pins.rb_node;
+ while (node) {
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
+ if (ptr < sp->ptr)
+ node = node->rb_left;
+ else if (ptr > sp->ptr)
+ node = node->rb_right;
+ else
+ break;
+ }
+ spin_unlock(&fs_info->swapfile_pins_lock);
+ return node != NULL;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index aefce89..ed80664 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -210,6 +210,8 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ u8 metadata_uuid[BTRFS_FSID_SIZE];
+ bool fsid_change;
struct list_head fs_list;
u64 num_devices;
@@ -218,6 +220,10 @@ struct btrfs_fs_devices {
u64 missing_devices;
u64 total_rw_bytes;
u64 total_devices;
+
+ /* Highest generation number of seen devices */
+ u64 latest_generation;
+
struct block_device *latest_bdev;
/* all of the devices in the FS, protected by a mutex
@@ -261,15 +267,12 @@ struct btrfs_fs_devices {
* we allocate are actually btrfs_io_bios. We'll cram as much of
* struct btrfs_bio as we can into this over time.
*/
-typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
struct btrfs_io_bio {
unsigned int mirror_num;
unsigned int stripe_index;
u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
- u8 *csum_allocated;
- btrfs_io_bio_end_io_t *end_io;
struct bvec_iter iter;
/*
* This member must come last, bio_alloc_bioset will allocate enough
@@ -283,15 +286,20 @@ static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
return container_of(bio, struct btrfs_io_bio, bio);
}
+static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio)
+{
+ if (io_bio->csum != io_bio->csum_inline) {
+ kfree(io_bio->csum);
+ io_bio->csum = NULL;
+ }
+}
+
struct btrfs_bio_stripe {
struct btrfs_device *dev;
u64 physical;
u64 length; /* only used for discard mappings */
};
-struct btrfs_bio;
-typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
-
struct btrfs_bio {
refcount_t refs;
atomic_t stripes_pending;
@@ -331,6 +339,8 @@ struct btrfs_raid_attr {
int tolerated_failures; /* max tolerated fail devs */
int devs_increment; /* ndevs has to be a multiple of this */
int ncopies; /* how many copies to data has */
+ int nparity; /* number of stripes worth of bytes to store
+ * parity information */
int mindev_error; /* error code if min devs requisite is unmet */
const char raid_name[8]; /* name of the raid */
u64 bg_flag; /* block group flag of the raid */
@@ -430,6 +440,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs);
+void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
@@ -462,6 +473,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 chunk_size);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ea78c3d..f141b45 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -11,6 +11,7 @@
#include <linux/security.h>
#include <linux/posix_acl_xattr.h>
#include <linux/iversion.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
@@ -422,9 +423,15 @@ static int btrfs_initxattrs(struct inode *inode,
{
const struct xattr *xattr;
struct btrfs_trans_handle *trans = fs_info;
+ unsigned int nofs_flag;
char *name;
int err = 0;
+ /*
+ * We're holding a transaction handle, so use a NOFS memory allocation
+ * context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
strlen(xattr->name) + 1, GFP_KERNEL);
@@ -440,6 +447,7 @@ static int btrfs_initxattrs(struct inode *inode,
if (err < 0)
break;
}
+ memalloc_nofs_restore(nofs_flag);
return err;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 1286c2b..52d024b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2366,7 +2366,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
balance_dirty_pages_ratelimited(mapping);
- if (unlikely(fatal_signal_pending(current))) {
+ if (fatal_signal_pending(current)) {
err = -EINTR;
goto out;
}
@@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
*/
bio = bio_alloc(GFP_NOIO, 1);
- if (wbc) {
- wbc_init_bio(wbc, bio);
- wbc_account_io(wbc, bh->b_page, bh->b_size);
- }
-
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
@@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
op_flags |= REQ_PRIO;
bio_set_op_attrs(bio, op, op_flags);
+ if (wbc) {
+ wbc_init_bio(wbc, bio);
+ wbc_account_io(wbc, bh->b_page, bh->b_size);
+ }
+
submit_bio(bio);
return 0;
}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 95983c7..1645fcf 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -244,11 +244,13 @@ wait_for_old_object:
ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
- cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_retry);
+ cache->cache.ops->put_object(&xobject->fscache,
+ (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_retry);
goto try_again;
requeue:
- cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_timeo);
+ cache->cache.ops->put_object(&xobject->fscache,
+ (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_timeo);
_leave(" = -ETIMEDOUT");
return -ETIMEDOUT;
}
@@ -336,7 +338,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
try_again:
/* first step is to make up a grave dentry in the graveyard */
sprintf(nbuffer, "%08x%08x",
- (uint32_t) get_seconds(),
+ (uint32_t) ktime_get_real_seconds(),
(uint32_t) atomic_inc_return(&cache->gravecounter));
/* do the multiway lock magic */
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 40f7595..8a57740 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -535,7 +535,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
+ put_page(backpage);
+ backpage = NULL;
put_page(netpage);
+ netpage = NULL;
fscache_retrieval_complete(op, 1);
continue;
}
@@ -608,7 +611,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
+ put_page(backpage);
+ backpage = NULL;
put_page(netpage);
+ netpage = NULL;
fscache_retrieval_complete(op, 1);
continue;
}
@@ -962,11 +968,8 @@ void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
__releases(&object->fscache.cookie->lock)
{
struct cachefiles_object *object;
- struct cachefiles_cache *cache;
object = container_of(_object, struct cachefiles_object, fscache);
- cache = container_of(object->fscache.cache,
- struct cachefiles_cache, cache);
_enter("%p,{%lu}", object, page->index);
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 0a29a00..511e6c6 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -135,7 +135,8 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(dentry);
+ if (!dentry)
+ return -ESTALE;
_enter("%p,#%d", object, auxdata->len);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8eade7a..5d0c05e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -306,7 +306,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct page *page = list_entry(page_list->prev, struct page, lru);
+ struct page *page = lru_to_page(page_list);
struct ceph_vino vino;
struct ceph_osd_request *req;
u64 off;
@@ -333,8 +333,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
if (got)
ceph_put_cap_refs(ci, got);
while (!list_empty(page_list)) {
- page = list_entry(page_list->prev,
- struct page, lru);
+ page = lru_to_page(page_list);
list_del(&page->lru);
put_page(page);
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f3496db..94c026b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -657,6 +657,9 @@ void ceph_add_cap(struct inode *inode,
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
} else {
+ if (cap->cap_gen < session->s_cap_gen)
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
+
/*
* auth mds of the inode changed. we received the cap export
* message, but still haven't received the cap import message.
@@ -1855,14 +1858,17 @@ retry_locked:
retain |= CEPH_CAP_ANY; /* be greedy */
} else if (S_ISDIR(inode->i_mode) &&
(issued & CEPH_CAP_FILE_SHARED) &&
- __ceph_dir_is_complete(ci)) {
+ __ceph_dir_is_complete(ci)) {
/*
* If a directory is complete, we want to keep
* the exclusive cap. So that MDS does not end up
* revoking the shared cap on every create/unlink
* operation.
*/
- want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ if (IS_RDONLY(inode))
+ want = CEPH_CAP_ANY_SHARED;
+ else
+ want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
retain |= want;
} else {
@@ -1970,8 +1976,7 @@ retry_locked:
goto ack;
/* things we might delay */
- if ((cap->issued & ~retain) == 0 &&
- cap->mds_wanted == want)
+ if ((cap->issued & ~retain) == 0)
continue; /* nope, all good */
if (no_delay)
@@ -3048,7 +3053,8 @@ static void handle_cap_grant(struct inode *inode,
int used, wanted, dirty;
u64 size = le64_to_cpu(grant->size);
u64 max_size = le64_to_cpu(grant->max_size);
- int check_caps = 0;
+ unsigned char check_caps = 0;
+ bool was_stale = cap->cap_gen < session->s_cap_gen;
bool wake = false;
bool writeback = false;
bool queue_trunc = false;
@@ -3063,21 +3069,6 @@ static void handle_cap_grant(struct inode *inode,
/*
- * auth mds of the inode changed. we received the cap export message,
- * but still haven't received the cap import message. handle_cap_export
- * updated the new auth MDS' cap.
- *
- * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
- * that was sent before the cap import message. So don't remove caps.
- */
- if (ceph_seq_cmp(seq, cap->seq) <= 0) {
- WARN_ON(cap != ci->i_auth_cap);
- WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
- seq = cap->seq;
- newcaps |= cap->issued;
- }
-
- /*
* If CACHE is being revoked, and we have no dirty buffers,
* try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.)
@@ -3096,6 +3087,24 @@ static void handle_cap_grant(struct inode *inode,
}
}
+ if (was_stale)
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
+
+ /*
+ * auth mds of the inode changed. we received the cap export message,
+ * but still haven't received the cap import message. handle_cap_export
+ * updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
+ * that was sent before the cap import message. So don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
+ seq = cap->seq;
+ newcaps |= cap->issued;
+ }
+
/* side effects now are allowed */
cap->cap_gen = session->s_cap_gen;
cap->seq = seq;
@@ -3200,13 +3209,20 @@ static void handle_cap_grant(struct inode *inode,
ceph_cap_string(wanted),
ceph_cap_string(used),
ceph_cap_string(dirty));
- if (wanted != le32_to_cpu(grant->wanted)) {
- dout("mds wanted %s -> %s\n",
- ceph_cap_string(le32_to_cpu(grant->wanted)),
- ceph_cap_string(wanted));
- /* imported cap may not have correct mds_wanted */
- if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
- check_caps = 1;
+
+ if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
+ (wanted & ~(cap->mds_wanted | newcaps))) {
+ /*
+ * If mds is importing cap, prior cap messages that update
+ * 'wanted' may get dropped by mds (migrate seq mismatch).
+ *
+ * We don't send cap message to update 'wanted' if what we
+ * want are already issued. If mds revokes caps, cap message
+ * that releases caps also tells mds what we want. But if
+ * caps got revoked by mds forcedly (session stale). We may
+ * haven't told mds what we want.
+ */
+ check_caps = 1;
}
/* revocation, grant, or no-op? */
@@ -3539,9 +3555,9 @@ retry:
goto out_unlock;
if (target < 0) {
- __ceph_remove_cap(cap, false);
- if (!ci->i_auth_cap)
+ if (cap->mds_wanted | cap->issued)
ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+ __ceph_remove_cap(cap, false);
goto out_unlock;
}
@@ -3569,7 +3585,6 @@ retry:
tcap->cap_id = t_cap_id;
tcap->seq = t_seq - 1;
tcap->issue_seq = t_seq - 1;
- tcap->mseq = t_mseq;
tcap->issued |= issued;
tcap->implemented |= issued;
if (cap == ci->i_auth_cap)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 79dd5e6..9d1f34d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1098,8 +1098,9 @@ out_unlock:
* splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe.
*/
-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
+static int splice_dentry(struct dentry **pdn, struct inode *in)
{
+ struct dentry *dn = *pdn;
struct dentry *realdn;
BUG_ON(d_inode(dn));
@@ -1132,28 +1133,23 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
if (IS_ERR(realdn)) {
pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
PTR_ERR(realdn), dn, in, ceph_vinop(in));
- dn = realdn;
- /*
- * Caller should release 'dn' in the case of error.
- * If 'req->r_dentry' is passed to this function,
- * caller should leave 'req->r_dentry' untouched.
- */
- goto out;
- } else if (realdn) {
+ return PTR_ERR(realdn);
+ }
+
+ if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
"inode %p ino %llx.%llx\n",
dn, d_count(dn),
realdn, d_count(realdn),
d_inode(realdn), ceph_vinop(d_inode(realdn)));
dput(dn);
- dn = realdn;
+ *pdn = realdn;
} else {
BUG_ON(!ceph_dentry(dn));
dout("dn %p attached to %p ino %llx.%llx\n",
dn, d_inode(dn), ceph_vinop(d_inode(dn)));
}
-out:
- return dn;
+ return 0;
}
/*
@@ -1340,7 +1336,12 @@ retry_lookup:
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
- dn = req->r_old_dentry; /* use old_dentry */
+ /* swap r_dentry and r_old_dentry in case that
+ * splice_dentry() gets called later. This is safe
+ * because no other place will use them */
+ req->r_dentry = req->r_old_dentry;
+ req->r_old_dentry = dn;
+ dn = req->r_dentry;
}
/* null dentry? */
@@ -1365,12 +1366,10 @@ retry_lookup:
if (d_really_is_negative(dn)) {
ceph_dir_clear_ordered(dir);
ihold(in);
- dn = splice_dentry(dn, in);
- if (IS_ERR(dn)) {
- err = PTR_ERR(dn);
+ err = splice_dentry(&req->r_dentry, in);
+ if (err < 0)
goto done;
- }
- req->r_dentry = dn; /* may have spliced */
+ dn = req->r_dentry; /* may have spliced */
} else if (d_really_is_positive(dn) && d_inode(dn) != in) {
dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
dn, d_inode(dn), ceph_vinop(d_inode(dn)),
@@ -1390,22 +1389,18 @@ retry_lookup:
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
req->r_op == CEPH_MDS_OP_MKSNAP) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
- struct dentry *dn = req->r_dentry;
struct inode *dir = req->r_parent;
/* fill out a snapdir LOOKUPSNAP dentry */
- BUG_ON(!dn);
BUG_ON(!dir);
BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
- dout(" linking snapped dir %p to dn %p\n", in, dn);
+ BUG_ON(!req->r_dentry);
+ dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry);
ceph_dir_clear_ordered(dir);
ihold(in);
- dn = splice_dentry(dn, in);
- if (IS_ERR(dn)) {
- err = PTR_ERR(dn);
+ err = splice_dentry(&req->r_dentry, in);
+ if (err < 0)
goto done;
- }
- req->r_dentry = dn; /* may have spliced */
} else if (rinfo->head->is_dentry) {
struct ceph_vino *ptvino = NULL;
@@ -1669,8 +1664,6 @@ retry_lookup:
}
if (d_really_is_negative(dn)) {
- struct dentry *realdn;
-
if (ceph_security_xattr_deadlock(in)) {
dout(" skip splicing dn %p to inode %p"
" (security xattr deadlock)\n", dn, in);
@@ -1679,13 +1672,9 @@ retry_lookup:
goto next_item;
}
- realdn = splice_dentry(dn, in);
- if (IS_ERR(realdn)) {
- err = PTR_ERR(realdn);
- d_drop(dn);
+ err = splice_dentry(&dn, in);
+ if (err < 0)
goto next_item;
- }
- dn = realdn;
}
ceph_dentry(dn)->offset = rde->offset;
@@ -1701,8 +1690,7 @@ retry_lookup:
err = ret;
}
next_item:
- if (dn)
- dput(dn);
+ dput(dn);
}
out:
if (err == 0 && skipped == 0) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index bd13a32..163fc74 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1232,13 +1232,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
spin_lock(&ci->i_ceph_lock);
+ if (cap->mds_wanted | cap->issued)
+ ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
__ceph_remove_cap(cap, false);
if (!ci->i_auth_cap) {
struct ceph_cap_flush *cf;
struct ceph_mds_client *mdsc = fsc->mdsc;
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
-
if (ci->i_wrbuffer_ref > 0 &&
READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
invalidate = true;
@@ -1355,6 +1355,12 @@ static void remove_session_caps(struct ceph_mds_session *session)
dispose_cap_releases(session->s_mdsc, &dispose);
}
+enum {
+ RECONNECT,
+ RENEWCAPS,
+ FORCE_RO,
+};
+
/*
* wake up any threads waiting on this session's caps. if the cap is
* old (didn't get renewed on the client reconnect), remove it now.
@@ -1365,23 +1371,34 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ unsigned long ev = (unsigned long)arg;
- if (arg) {
+ if (ev == RECONNECT) {
spin_lock(&ci->i_ceph_lock);
ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock);
+ } else if (ev == RENEWCAPS) {
+ if (cap->cap_gen < cap->session->s_cap_gen) {
+ /* mds did not re-issue stale cap */
+ spin_lock(&ci->i_ceph_lock);
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
+ /* make sure mds knows what we want */
+ if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
+ ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ } else if (ev == FORCE_RO) {
}
wake_up_all(&ci->i_cap_wq);
return 0;
}
-static void wake_up_session_caps(struct ceph_mds_session *session,
- int reconnect)
+static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
{
dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
iterate_session_caps(session, wake_up_session_cb,
- (void *)(unsigned long)reconnect);
+ (void *)(unsigned long)ev);
}
/*
@@ -1466,7 +1483,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
spin_unlock(&session->s_cap_lock);
if (wake)
- wake_up_session_caps(session, 0);
+ wake_up_session_caps(session, RENEWCAPS);
}
/*
@@ -2847,7 +2864,7 @@ static void handle_session(struct ceph_mds_session *session,
spin_lock(&session->s_cap_lock);
session->s_readonly = true;
spin_unlock(&session->s_cap_lock);
- wake_up_session_caps(session, 0);
+ wake_up_session_caps(session, FORCE_RO);
break;
case CEPH_SESSION_REJECT:
@@ -2943,11 +2960,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_inode_info *ci = cap->ci;
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
- char *path;
- int pathlen, err;
- u64 pathbase;
+ int err;
u64 snap_follows;
- struct dentry *dentry;
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
inode, ceph_vinop(inode), cap, cap->cap_id,
@@ -2956,19 +2970,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (err)
return err;
- dentry = d_find_alias(inode);
- if (dentry) {
- path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out_dput;
- }
- } else {
- path = NULL;
- pathlen = 0;
- pathbase = 0;
- }
-
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
@@ -2980,7 +2981,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = cpu_to_le64(pathbase);
+ rec.v2.pathbase = 0;
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@@ -2991,7 +2992,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = cpu_to_le64(pathbase);
+ rec.v1.pathbase = 0;
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -3023,7 +3024,7 @@ encode_again:
GFP_NOFS);
if (!flocks) {
err = -ENOMEM;
- goto out_free;
+ goto out_err;
}
err = ceph_encode_locks_to_buffer(inode, flocks,
num_fcntl_locks,
@@ -3033,7 +3034,7 @@ encode_again:
flocks = NULL;
if (err == -ENOSPC)
goto encode_again;
- goto out_free;
+ goto out_err;
}
} else {
kfree(flocks);
@@ -3053,44 +3054,64 @@ encode_again:
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(rec.v2);
- struct_len += sizeof(u32) + pathlen;
+ struct_len += sizeof(u32) + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
total_len += struct_len;
err = ceph_pagelist_reserve(pagelist, total_len);
+ if (err) {
+ kfree(flocks);
+ goto out_err;
+ }
- if (!err) {
- if (recon_state->msg_version >= 3) {
- ceph_pagelist_encode_8(pagelist, struct_v);
- ceph_pagelist_encode_8(pagelist, 1);
- ceph_pagelist_encode_32(pagelist, struct_len);
- }
- ceph_pagelist_encode_string(pagelist, path, pathlen);
- ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
- ceph_locks_to_pagelist(flocks, pagelist,
- num_fcntl_locks,
- num_flock_locks);
- if (struct_v >= 2)
- ceph_pagelist_encode_64(pagelist, snap_follows);
+ if (recon_state->msg_version >= 3) {
+ ceph_pagelist_encode_8(pagelist, struct_v);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, struct_len);
}
+ ceph_pagelist_encode_string(pagelist, NULL, 0);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
+ ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks, num_flock_locks);
+ if (struct_v >= 2)
+ ceph_pagelist_encode_64(pagelist, snap_follows);
+
kfree(flocks);
} else {
- size_t size = sizeof(u32) + pathlen + sizeof(rec.v1);
- err = ceph_pagelist_reserve(pagelist, size);
- if (!err) {
- ceph_pagelist_encode_string(pagelist, path, pathlen);
- ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+ u64 pathbase = 0;
+ int pathlen = 0;
+ char *path = NULL;
+ struct dentry *dentry;
+
+ dentry = d_find_alias(inode);
+ if (dentry) {
+ path = ceph_mdsc_build_path(dentry,
+ &pathlen, &pathbase, 0);
+ dput(dentry);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out_err;
+ }
+ rec.v1.pathbase = cpu_to_le64(pathbase);
}
+
+ err = ceph_pagelist_reserve(pagelist,
+ pathlen + sizeof(u32) + sizeof(rec.v1));
+ if (err) {
+ kfree(path);
+ goto out_err;
+ }
+
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+
+ kfree(path);
}
recon_state->nr_caps++;
-out_free:
- kfree(path);
-out_dput:
- dput(dentry);
+out_err:
return err;
}
@@ -3339,7 +3360,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
pr_info("mds%d recovery completed\n", s->s_mds);
kick_requests(mdsc, i);
ceph_kick_flushing_caps(mdsc, s);
- wake_up_session_caps(s, 1);
+ wake_up_session_caps(s, RECONNECT);
}
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 32fcce0..729da15 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -17,14 +17,16 @@
#include <linux/ceph/auth.h>
/* The first 8 bits are reserved for old ceph releases */
-#define CEPHFS_FEATURE_MIMIC 8
-
-#define CEPHFS_FEATURES_ALL { \
- 0, 1, 2, 3, 4, 5, 6, 7, \
- CEPHFS_FEATURE_MIMIC, \
+#define CEPHFS_FEATURE_MIMIC 8
+#define CEPHFS_FEATURE_REPLY_ENCODING 9
+#define CEPHFS_FEATURE_RECLAIM_CLIENT 10
+#define CEPHFS_FEATURE_LAZY_CAP_WANTED 11
+
+#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
+ 0, 1, 2, 3, 4, 5, 6, 7, \
+ CEPHFS_FEATURE_MIMIC, \
+ CEPHFS_FEATURE_LAZY_CAP_WANTED, \
}
-
-#define CEPHFS_FEATURES_CLIENT_SUPPORTED CEPHFS_FEATURES_ALL
#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 44e53ab..1a2c5d3 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -35,7 +35,6 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
/* pick */
n = prandom_u32() % n;
- i = 0;
for (i = 0; n > 0; i++, n--)
while (m->m_info[i].state <= 0)
i++;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b5ecd6f..4e9a7cc 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -563,8 +563,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noacl");
#endif
- if (fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM)
- seq_puts(m, ",nocopyfrom");
+ if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0)
+ seq_puts(m, ",copyfrom");
if (fsopt->mds_namespace)
seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c005a54..dfb64a5 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -42,7 +42,9 @@
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
-#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
+#define CEPH_MOUNT_OPT_DEFAULT \
+ (CEPH_MOUNT_OPT_DCACHE | \
+ CEPH_MOUNT_OPT_NOCOPYFROM)
#define ceph_set_mount_opt(fsc, opt) \
(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
@@ -808,7 +810,7 @@ static inline int default_congestion_kb(void)
* This allows larger machines to have larger/more transfers.
* Limit the default to 256M
*/
- congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
if (congestion_kb > 256*1024)
congestion_kb = 256*1024;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index abcd78e..f1ddc9d 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -133,7 +133,7 @@ config CIFS_XATTR
config CIFS_POSIX
bool "CIFS POSIX Extensions"
- depends on CIFS_XATTR
+ depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
help
Enabling this option will cause the cifs client to attempt to
negotiate a newer dialect with servers, such as Samba 3.0.5
@@ -190,8 +190,9 @@ config CIFS_DFS_UPCALL
moves to a different server. This feature also enables
an upcall mechanism for CIFS which contacts userspace helper
utilities to provide server name resolution (host names to
- IP addresses) which is needed for implicit mounts of DFS junction
- points. If unsure, say Y.
+ IP addresses) which is needed in order to reconnect to
+ servers if their addresses change or for implicit mounts of
+ DFS junction points. If unsure, say Y.
config CIFS_NFSD_EXPORT
bool "Allow nfsd to export CIFS file system"
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 8581799..51af69a 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -17,7 +17,7 @@ cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
-cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o
cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index ba178b09..593fb42 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -30,6 +30,9 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifsfs.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
#ifdef CONFIG_CIFS_SMB_DIRECT
#include "smbdirect.h"
#endif
@@ -629,6 +632,11 @@ cifs_proc_init(void)
&cifs_security_flags_proc_fops);
proc_create("LookupCacheEnabled", 0644, proc_fs_cifs,
&cifs_lookup_cache_proc_fops);
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_fops);
+#endif
+
#ifdef CONFIG_CIFS_SMB_DIRECT
proc_create("rdma_readwrite_threshold", 0644, proc_fs_cifs,
&cifs_rdma_readwrite_threshold_proc_fops);
@@ -663,6 +671,10 @@ cifs_proc_clean(void)
remove_proc_entry("SecurityFlags", proc_fs_cifs);
remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ remove_proc_entry("dfscache", proc_fs_cifs);
+#endif
#ifdef CONFIG_CIFS_SMB_DIRECT
remove_proc_entry("rdma_readwrite_threshold", proc_fs_cifs);
remove_proc_entry("smbd_max_frmr_depth", proc_fs_cifs);
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b97c74e..d9b99ab 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -25,6 +25,7 @@
#include "dns_resolve.h"
#include "cifs_debug.h"
#include "cifs_unicode.h"
+#include "dfs_cache.h"
static LIST_HEAD(cifs_dfs_automount_list);
@@ -126,7 +127,7 @@ cifs_build_devname(char *nodename, const char *prepath)
* @sb_mountdata: parent/root DFS mount options (template)
* @fullpath: full path in UNC format
* @ref: server's referral
- * @devname: pointer for saving device name
+ * @devname: optional pointer for saving device name
*
* creates mount options for submount based on template options sb_mountdata
* and replacing unc,ip,prefixpath options with ones we've got form ref_unc.
@@ -140,6 +141,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
char **devname)
{
int rc;
+ char *name;
char *mountdata = NULL;
const char *prepath = NULL;
int md_len;
@@ -158,17 +160,17 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
prepath++;
}
- *devname = cifs_build_devname(ref->node_name, prepath);
- if (IS_ERR(*devname)) {
- rc = PTR_ERR(*devname);
- *devname = NULL;
+ name = cifs_build_devname(ref->node_name, prepath);
+ if (IS_ERR(name)) {
+ rc = PTR_ERR(name);
+ name = NULL;
goto compose_mount_options_err;
}
- rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
+ rc = dns_resolve_server_name_to_ip(name, &srvIP);
if (rc < 0) {
cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n",
- __func__, *devname, rc);
+ __func__, name, rc);
goto compose_mount_options_err;
}
@@ -224,6 +226,9 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
strcat(mountdata, "ip=");
strcat(mountdata, srvIP);
+ if (devname)
+ *devname = name;
+
/*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
/*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
@@ -234,8 +239,7 @@ compose_mount_options_out:
compose_mount_options_err:
kfree(mountdata);
mountdata = ERR_PTR(rc);
- kfree(*devname);
- *devname = NULL;
+ kfree(name);
goto compose_mount_options_out;
}
@@ -251,20 +255,30 @@ static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
{
struct vfsmount *mnt;
char *mountdata;
- char *devname = NULL;
+ char *devname;
+
+ /*
+ * Always pass down the DFS full path to smb3_do_mount() so we
+ * can use it later for failover.
+ */
+ devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
+ if (!devname)
+ return ERR_PTR(-ENOMEM);
+
+ convert_delimiter(devname, '/');
/* strip first '\' from fullpath */
mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
- fullpath + 1, ref, &devname);
-
- if (IS_ERR(mountdata))
+ fullpath + 1, ref, NULL);
+ if (IS_ERR(mountdata)) {
+ kfree(devname);
return (struct vfsmount *)mountdata;
+ }
mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
kfree(mountdata);
kfree(devname);
return mnt;
-
}
static void dump_referral(const struct dfs_info3_param *ref)
@@ -282,16 +296,15 @@ static void dump_referral(const struct dfs_info3_param *ref)
*/
static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
{
- struct dfs_info3_param *referrals = NULL;
- unsigned int num_referrals = 0;
+ struct dfs_info3_param referral = {0};
struct cifs_sb_info *cifs_sb;
struct cifs_ses *ses;
- char *full_path;
+ struct cifs_tcon *tcon;
+ char *full_path, *root_path;
unsigned int xid;
- int i;
+ int len;
int rc;
struct vfsmount *mnt;
- struct tcon_link *tlink;
cifs_dbg(FYI, "in %s\n", __func__);
BUG_ON(IS_ROOT(mntpt));
@@ -315,48 +328,69 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
if (full_path == NULL)
goto cdda_exit;
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink)) {
- mnt = ERR_CAST(tlink);
+ cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
+
+ if (!cifs_sb_master_tlink(cifs_sb)) {
+ cifs_dbg(FYI, "%s: master tlink is NULL\n", __func__);
goto free_full_path;
}
- ses = tlink_tcon(tlink)->ses;
+ tcon = cifs_sb_master_tcon(cifs_sb);
+ if (!tcon) {
+ cifs_dbg(FYI, "%s: master tcon is NULL\n", __func__);
+ goto free_full_path;
+ }
+
+ root_path = kstrdup(tcon->treeName, GFP_KERNEL);
+ if (!root_path) {
+ mnt = ERR_PTR(-ENOMEM);
+ goto free_full_path;
+ }
+ cifs_dbg(FYI, "%s: root path: %s\n", __func__, root_path);
+
+ ses = tcon->ses;
xid = get_xid();
- rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
- &num_referrals, &referrals,
- cifs_remap(cifs_sb));
- free_xid(xid);
- cifs_put_tlink(tlink);
-
- mnt = ERR_PTR(-ENOENT);
- for (i = 0; i < num_referrals; i++) {
- int len;
- dump_referral(referrals + i);
- /* connect to a node */
- len = strlen(referrals[i].node_name);
- if (len < 2) {
- cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
- __func__, referrals[i].node_name);
- mnt = ERR_PTR(-EINVAL);
- break;
- }
- mnt = cifs_dfs_do_refmount(mntpt, cifs_sb,
- full_path, referrals + i);
- cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
- __func__, referrals[i].node_name, mnt);
- if (!IS_ERR(mnt))
- goto success;
+ /*
+ * If DFS root has been expired, then unconditionally fetch it again to
+ * refresh DFS referral cache.
+ */
+ rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
+ root_path + 1, NULL, NULL);
+ if (!rc) {
+ rc = dfs_cache_find(xid, ses, cifs_sb->local_nls,
+ cifs_remap(cifs_sb), full_path + 1,
+ &referral, NULL);
}
- /* no valid submounts were found; return error from get_dfs_path() by
- * preference */
- if (rc != 0)
+ free_xid(xid);
+
+ if (rc) {
mnt = ERR_PTR(rc);
+ goto free_root_path;
+ }
+
+ dump_referral(&referral);
-success:
- free_dfs_info_array(referrals, num_referrals);
+ len = strlen(referral.node_name);
+ if (len < 2) {
+ cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
+ __func__, referral.node_name);
+ mnt = ERR_PTR(-EINVAL);
+ goto free_dfs_ref;
+ }
+ /*
+ * cifs_mount() will retry every available node server in case
+ * of failures.
+ */
+ mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, &referral);
+ cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__,
+ referral.node_name, mnt);
+
+free_dfs_ref:
+ free_dfs_info_param(&referral);
+free_root_path:
+ kfree(root_path);
free_full_path:
kfree(full_path);
cdda_exit:
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 63d7530..42f0d67 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -72,6 +72,15 @@ struct cifs_sb_info {
char *mountdata; /* options received at mount time or via DFS refs */
struct delayed_work prune_tlinks;
struct rcu_head rcu;
+
+ /* only used when CIFS_MOUNT_USE_PREFIX_PATH is set */
char *prepath;
+
+ /*
+ * Path initially provided by the mount call. We might connect
+ * to something different via DFS but we want to keep it to do
+ * failover properly.
+ */
+ char *origin_fullpath; /* \\HOST\SHARE\[OPTIONAL PATH] */
};
#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 85b31cf..d2a05e4 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -224,7 +224,7 @@ int cifs_verify_signature(struct smb_rqst *rqst,
if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) {
struct smb_com_lock_req *pSMB =
(struct smb_com_lock_req *)cifs_pdu;
- if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)
+ if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)
return 0;
}
@@ -304,12 +304,17 @@ int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
char *lnm_session_key)
{
- int i;
+ int i, len;
int rc;
char password_with_pad[CIFS_ENCPWD_SIZE] = {0};
- if (password)
- strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
+ if (password) {
+ for (len = 0; len < CIFS_ENCPWD_SIZE; len++)
+ if (!password[len])
+ break;
+
+ memcpy(password_with_pad, password, len);
+ }
if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
memcpy(lnm_session_key, password_with_pad,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 865706e..62d48d4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -52,6 +52,9 @@
#include "cifs_spnego.h"
#include "fscache.h"
#include "smb2pdu.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
int cifsFYI = 0;
bool traceSMB;
@@ -1494,10 +1497,15 @@ init_cifs(void)
if (rc)
goto out_destroy_mids;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ rc = dfs_cache_init();
+ if (rc)
+ goto out_destroy_request_bufs;
+#endif /* CONFIG_CIFS_DFS_UPCALL */
#ifdef CONFIG_CIFS_UPCALL
rc = init_cifs_spnego();
if (rc)
- goto out_destroy_request_bufs;
+ goto out_destroy_dfs_cache;
#endif /* CONFIG_CIFS_UPCALL */
#ifdef CONFIG_CIFS_ACL
@@ -1525,6 +1533,10 @@ out_register_key_type:
#endif
#ifdef CONFIG_CIFS_UPCALL
exit_cifs_spnego();
+out_destroy_dfs_cache:
+#endif
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ dfs_cache_destroy();
out_destroy_request_bufs:
#endif
cifs_destroy_request_bufs();
@@ -1556,6 +1568,9 @@ exit_cifs(void)
#ifdef CONFIG_CIFS_UPCALL
exit_cifs_spnego();
#endif
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ dfs_cache_destroy();
+#endif
cifs_destroy_request_bufs();
cifs_destroy_mids();
cifs_destroy_inodecache();
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 4c3b5cf..26776ed 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.14"
+#define CIFS_VERSION "2.15"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 38ab0fc..01ded70 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -701,6 +701,13 @@ struct TCP_Server_Info {
struct delayed_work reconnect; /* reconnect workqueue job */
struct mutex reconnect_mutex; /* prevent simultaneous reconnects */
unsigned long echo_interval;
+
+ /*
+ * Number of targets available for reconnect. The more targets
+ * the more tasks have to wait to let the demultiplex thread
+ * reconnect.
+ */
+ int nr_targets;
};
static inline unsigned int
@@ -1014,6 +1021,11 @@ struct cifs_tcon {
struct list_head pending_opens; /* list of incomplete opens */
struct cached_fid crfid; /* Cached root fid */
/* BB add field for back pointer to sb struct(s)? */
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ char *dfs_path;
+ int remap:2;
+ struct list_head ulist; /* cache update list */
+#endif
};
/*
@@ -1508,6 +1520,7 @@ struct dfs_info3_param {
int ref_flag;
char *path_name;
char *node_name;
+ int ttl;
};
/*
@@ -1545,7 +1558,6 @@ static inline void free_dfs_info_param(struct dfs_info3_param *param)
if (param) {
kfree(param->path_name);
kfree(param->node_name);
- kfree(param);
}
}
@@ -1790,6 +1802,7 @@ extern struct smb_version_values smb3any_values;
extern struct smb_version_operations smb30_operations;
extern struct smb_version_values smb30_values;
#define SMB302_VERSION_STRING "3.02"
+#define ALT_SMB302_VERSION_STRING "3.0.2"
/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
extern struct smb_version_values smb302_values;
#define SMB311_VERSION_STRING "3.1.1"
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fa361bc..336c116 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -22,6 +22,9 @@
#define _CIFSPROTO_H
#include <linux/nls.h>
#include "trace.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
struct statfs;
struct smb_vol;
@@ -213,7 +216,7 @@ extern int cifs_match_super(struct super_block *, void *);
extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info);
extern struct smb_vol *cifs_get_volume_info(char *mount_data,
const char *devname, bool is_smb3);
-extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *);
+extern int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol);
extern void cifs_umount(struct cifs_sb_info *);
extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon);
@@ -294,11 +297,6 @@ extern int CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses,
unsigned int *num_of_nodes,
const struct nls_table *nls_codepage, int remap);
-extern int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
- const char *old_path,
- const struct nls_table *nls_codepage,
- unsigned int *num_referrals,
- struct dfs_info3_param **referrals, int remap);
extern int parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size,
unsigned int *num_of_nodes,
struct dfs_info3_param **target_nodes,
@@ -524,6 +522,11 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
const struct nls_table *codepage);
extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
unsigned char *p24);
+extern void
+cifs_cleanup_volume_info_contents(struct smb_vol *volume_info);
+
+extern struct TCP_Server_Info *
+cifs_find_tcp_session(struct smb_vol *vol);
void cifs_readdata_release(struct kref *refcount);
int cifs_async_readv(struct cifs_readdata *rdata);
@@ -562,4 +565,17 @@ void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc);
extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
unsigned int *len, unsigned int *offset);
+void extract_unc_hostname(const char *unc, const char **h, size_t *len);
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
+ const char *old_path,
+ const struct nls_table *nls_codepage,
+ struct dfs_info3_param *referral, int remap)
+{
+ return dfs_cache_find(xid, ses, nls_codepage, remap, old_path,
+ referral, NULL);
+}
+#endif
+
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f82fd34..b1f49c1 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -44,6 +44,9 @@
#include "cifs_debug.h"
#include "fscache.h"
#include "smbdirect.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
#ifdef CONFIG_CIFS_POSIX
static struct {
@@ -118,6 +121,77 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
*/
}
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
+ struct cifs_tcon *tcon)
+{
+ int rc;
+ struct dfs_cache_tgt_list tl;
+ struct dfs_cache_tgt_iterator *it = NULL;
+ char tree[MAX_TREE_SIZE + 1];
+ const char *tcp_host;
+ size_t tcp_host_len;
+ const char *dfs_host;
+ size_t dfs_host_len;
+
+ if (tcon->ipc) {
+ snprintf(tree, sizeof(tree), "\\\\%s\\IPC$",
+ tcon->ses->server->hostname);
+ return CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
+ }
+
+ if (!tcon->dfs_path)
+ return CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+
+ rc = dfs_cache_noreq_find(tcon->dfs_path + 1, NULL, &tl);
+ if (rc)
+ return rc;
+
+ extract_unc_hostname(tcon->ses->server->hostname, &tcp_host,
+ &tcp_host_len);
+
+ for (it = dfs_cache_get_tgt_iterator(&tl); it;
+ it = dfs_cache_get_next_tgt(&tl, it)) {
+ const char *tgt = dfs_cache_get_tgt_name(it);
+
+ extract_unc_hostname(tgt, &dfs_host, &dfs_host_len);
+
+ if (dfs_host_len != tcp_host_len
+ || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
+ cifs_dbg(FYI, "%s: skipping %.*s, doesn't match %.*s",
+ __func__,
+ (int)dfs_host_len, dfs_host,
+ (int)tcp_host_len, tcp_host);
+ continue;
+ }
+
+ snprintf(tree, sizeof(tree), "\\%s", tgt);
+
+ rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
+ if (!rc)
+ break;
+ if (rc == -EREMOTE)
+ break;
+ }
+
+ if (!rc) {
+ if (it)
+ rc = dfs_cache_noreq_update_tgthint(tcon->dfs_path + 1,
+ it);
+ else
+ rc = -ENOENT;
+ }
+ dfs_cache_free_tgts(&tl);
+ return rc;
+}
+#else
+static inline int __cifs_reconnect_tcon(const struct nls_table *nlsc,
+ struct cifs_tcon *tcon)
+{
+ return CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+}
+#endif
+
/* reconnect the socket, tcon, and smb session if needed */
static int
cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
@@ -126,6 +200,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
struct cifs_ses *ses;
struct TCP_Server_Info *server;
struct nls_table *nls_codepage;
+ int retries;
/*
* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
@@ -152,9 +227,12 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
}
}
+ retries = server->nr_targets;
+
/*
- * Give demultiplex thread up to 10 seconds to reconnect, should be
- * greater than cifs socket timeout which is 7 seconds
+ * Give demultiplex thread up to 10 seconds to each target available for
+ * reconnect -- should be greater than cifs socket timeout which is 7
+ * seconds.
*/
while (server->tcpStatus == CifsNeedReconnect) {
rc = wait_event_interruptible_timeout(server->response_q,
@@ -170,6 +248,9 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
if (server->tcpStatus != CifsNeedReconnect)
break;
+ if (--retries)
+ continue;
+
/*
* on "soft" mounts we wait once. Hard mounts keep
* retrying until process is killed or server comes
@@ -179,6 +260,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
return -EHOSTDOWN;
}
+ retries = server->nr_targets;
}
if (!ses->need_reconnect && !tcon->need_reconnect)
@@ -214,7 +296,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
}
cifs_mark_open_files_invalid(tcon);
- rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
+ rc = __cifs_reconnect_tcon(nls_codepage, tcon);
mutex_unlock(&ses->session_mutex);
cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 6f24f12..f665296 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -56,6 +56,11 @@
#include "fscache.h"
#include "smb2proto.h"
#include "smbdirect.h"
+#include "dns_resolve.h"
+#include "cifsfs.h"
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
extern mempool_t *cifs_req_poolp;
extern bool disable_legacy_dialects;
@@ -304,6 +309,7 @@ static const match_table_t cifs_smb_version_tokens = {
{ Smb_21, SMB21_VERSION_STRING },
{ Smb_30, SMB30_VERSION_STRING },
{ Smb_302, SMB302_VERSION_STRING },
+ { Smb_302, ALT_SMB302_VERSION_STRING },
{ Smb_311, SMB311_VERSION_STRING },
{ Smb_311, ALT_SMB311_VERSION_STRING },
{ Smb_3any, SMB3ANY_VERSION_STRING },
@@ -317,6 +323,131 @@ static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
static void cifs_prune_tlinks(struct work_struct *work);
static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
const char *devname, bool is_smb3);
+static char *extract_hostname(const char *unc);
+
+/*
+ * Resolve hostname and set ip addr in tcp ses. Useful for hostnames that may
+ * get their ip addresses changed at some point.
+ *
+ * This should be called with server->srv_mutex held.
+ */
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static int reconn_set_ipaddr(struct TCP_Server_Info *server)
+{
+ int rc;
+ int len;
+ char *unc, *ipaddr = NULL;
+
+ if (!server->hostname)
+ return -EINVAL;
+
+ len = strlen(server->hostname) + 3;
+
+ unc = kmalloc(len, GFP_KERNEL);
+ if (!unc) {
+ cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__);
+ return -ENOMEM;
+ }
+ snprintf(unc, len, "\\\\%s", server->hostname);
+
+ rc = dns_resolve_server_name_to_ip(unc, &ipaddr);
+ kfree(unc);
+
+ if (rc < 0) {
+ cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n",
+ __func__, server->hostname, rc);
+ return rc;
+ }
+
+ rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr,
+ strlen(ipaddr));
+ kfree(ipaddr);
+
+ return !rc ? -1 : 0;
+}
+#else
+static inline int reconn_set_ipaddr(struct TCP_Server_Info *server)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+struct super_cb_data {
+ struct TCP_Server_Info *server;
+ struct cifs_sb_info *cifs_sb;
+};
+
+/* These functions must be called with server->srv_mutex held */
+
+static void super_cb(struct super_block *sb, void *arg)
+{
+ struct super_cb_data *d = arg;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+
+ if (d->cifs_sb)
+ return;
+
+ cifs_sb = CIFS_SB(sb);
+ tcon = cifs_sb_master_tcon(cifs_sb);
+ if (tcon->ses->server == d->server)
+ d->cifs_sb = cifs_sb;
+}
+
+static inline struct cifs_sb_info *
+find_super_by_tcp(struct TCP_Server_Info *server)
+{
+ struct super_cb_data d = {
+ .server = server,
+ .cifs_sb = NULL,
+ };
+
+ iterate_supers_type(&cifs_fs_type, super_cb, &d);
+ return d.cifs_sb ? d.cifs_sb : ERR_PTR(-ENOENT);
+}
+
+static void reconn_inval_dfs_target(struct TCP_Server_Info *server,
+ struct cifs_sb_info *cifs_sb,
+ struct dfs_cache_tgt_list *tgt_list,
+ struct dfs_cache_tgt_iterator **tgt_it)
+{
+ const char *name;
+
+ if (!cifs_sb || !cifs_sb->origin_fullpath || !tgt_list ||
+ !server->nr_targets)
+ return;
+
+ if (!*tgt_it) {
+ *tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
+ } else {
+ *tgt_it = dfs_cache_get_next_tgt(tgt_list, *tgt_it);
+ if (!*tgt_it)
+ *tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
+ }
+
+ cifs_dbg(FYI, "%s: UNC: %s\n", __func__, cifs_sb->origin_fullpath);
+
+ name = dfs_cache_get_tgt_name(*tgt_it);
+
+ kfree(server->hostname);
+
+ server->hostname = extract_hostname(name);
+ if (!server->hostname) {
+ cifs_dbg(FYI, "%s: failed to extract hostname from target: %d\n",
+ __func__, -ENOMEM);
+ }
+}
+
+static inline int reconn_setup_dfs_targets(struct cifs_sb_info *cifs_sb,
+ struct dfs_cache_tgt_list *tl,
+ struct dfs_cache_tgt_iterator **it)
+{
+ if (!cifs_sb->origin_fullpath)
+ return -EOPNOTSUPP;
+ return dfs_cache_noreq_find(cifs_sb->origin_fullpath + 1, NULL, tl);
+}
+#endif
/*
* cifs tcp session reconnection
@@ -335,8 +466,33 @@ cifs_reconnect(struct TCP_Server_Info *server)
struct cifs_tcon *tcon;
struct mid_q_entry *mid_entry;
struct list_head retry_list;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ struct cifs_sb_info *cifs_sb = NULL;
+ struct dfs_cache_tgt_list tgt_list = {0};
+ struct dfs_cache_tgt_iterator *tgt_it = NULL;
+#endif
spin_lock(&GlobalMid_Lock);
+ server->nr_targets = 1;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ cifs_sb = find_super_by_tcp(server);
+ if (IS_ERR(cifs_sb)) {
+ rc = PTR_ERR(cifs_sb);
+ cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n",
+ __func__, rc);
+ cifs_sb = NULL;
+ } else {
+ rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it);
+ if (rc && (rc != -EOPNOTSUPP)) {
+ cifs_dbg(VFS, "%s: no target servers for DFS failover\n",
+ __func__);
+ } else {
+ server->nr_targets = dfs_cache_get_nr_tgts(&tgt_list);
+ }
+ }
+ cifs_dbg(FYI, "%s: will retry %d target(s)\n", __func__,
+ server->nr_targets);
+#endif
if (server->tcpStatus == CifsExiting) {
/* the demux thread will exit normally
next time through the loop */
@@ -410,14 +566,27 @@ cifs_reconnect(struct TCP_Server_Info *server)
do {
try_to_freeze();
- /* we should try only the port we connected to before */
mutex_lock(&server->srv_mutex);
+ /*
+ * Set up next DFS target server (if any) for reconnect. If DFS
+ * feature is disabled, then we will retry last server we
+ * connected to before.
+ */
if (cifs_rdma_enabled(server))
rc = smbd_reconnect(server);
else
rc = generic_ip_connect(server);
if (rc) {
cifs_dbg(FYI, "reconnect error %d\n", rc);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ reconn_inval_dfs_target(server, cifs_sb, &tgt_list,
+ &tgt_it);
+#endif
+ rc = reconn_set_ipaddr(server);
+ if (rc) {
+ cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
+ __func__, rc);
+ }
mutex_unlock(&server->srv_mutex);
msleep(3000);
} else {
@@ -430,6 +599,22 @@ cifs_reconnect(struct TCP_Server_Info *server)
}
} while (server->tcpStatus == CifsNeedReconnect);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (tgt_it) {
+ rc = dfs_cache_noreq_update_tgthint(cifs_sb->origin_fullpath + 1,
+ tgt_it);
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to update DFS target hint: rc = %d\n",
+ __func__, rc);
+ }
+ rc = dfs_cache_update_vol(cifs_sb->origin_fullpath, server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to update vol info in DFS cache: rc = %d\n",
+ __func__, rc);
+ }
+ dfs_cache_free_tgts(&tgt_list);
+ }
+#endif
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
@@ -1043,7 +1228,12 @@ extract_hostname(const char *unc)
/* skip double chars at beginning of string */
/* BB: check validity of these bytes? */
- src = unc + 2;
+ if (strlen(unc) < 3)
+ return ERR_PTR(-EINVAL);
+ for (src = unc; *src && *src == '\\'; src++)
+ ;
+ if (!*src)
+ return ERR_PTR(-EINVAL);
/* delimiter between hostname and sharename is always '\\' now */
delim = strchr(src, '\\');
@@ -1827,7 +2017,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->password = NULL;
break;
}
- /* Yes it is. Drop down to Opt_pass below.*/
+ /* Fallthrough - to Opt_pass below.*/
case Opt_pass:
/* Obtain the value string */
value = strchr(data, '=');
@@ -2289,7 +2479,7 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
return 1;
}
-static struct TCP_Server_Info *
+struct TCP_Server_Info *
cifs_find_tcp_session(struct smb_vol *vol)
{
struct TCP_Server_Info *server;
@@ -2461,6 +2651,8 @@ smbd_connected:
}
tcp_ses->tcpStatus = CifsNeedNegotiate;
+ tcp_ses->nr_targets = 1;
+
/* thread spawned, put it on the list */
spin_lock(&cifs_tcp_ses_lock);
list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
@@ -3256,25 +3448,6 @@ out:
return rc;
}
-int
-get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path,
- const struct nls_table *nls_codepage, unsigned int *num_referrals,
- struct dfs_info3_param **referrals, int remap)
-{
- int rc = 0;
-
- if (!ses->server->ops->get_dfs_refer)
- return -ENOSYS;
-
- *num_referrals = 0;
- *referrals = NULL;
-
- rc = ses->server->ops->get_dfs_refer(xid, ses, old_path,
- referrals, num_referrals,
- nls_codepage, remap);
- return rc;
-}
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key cifs_key[2];
static struct lock_class_key cifs_slock_key[2];
@@ -3746,8 +3919,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
return 0;
}
-static void
-cleanup_volume_info_contents(struct smb_vol *volume_info)
+void
+cifs_cleanup_volume_info_contents(struct smb_vol *volume_info)
{
kfree(volume_info->username);
kzfree(volume_info->password);
@@ -3762,10 +3935,136 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info)
{
if (!volume_info)
return;
- cleanup_volume_info_contents(volume_info);
+ cifs_cleanup_volume_info_contents(volume_info);
kfree(volume_info);
}
+/* Release all succeed connections */
+static inline void mount_put_conns(struct cifs_sb_info *cifs_sb,
+ unsigned int xid,
+ struct TCP_Server_Info *server,
+ struct cifs_ses *ses, struct cifs_tcon *tcon)
+{
+ int rc = 0;
+
+ if (tcon)
+ cifs_put_tcon(tcon);
+ else if (ses)
+ cifs_put_smb_ses(ses);
+ else if (server)
+ cifs_put_tcp_session(server, 0);
+ cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
+ free_xid(xid);
+}
+
+/* Get connections for tcp, ses and tcon */
+static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
+ unsigned int *xid,
+ struct TCP_Server_Info **nserver,
+ struct cifs_ses **nses, struct cifs_tcon **ntcon)
+{
+ int rc = 0;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ *nserver = NULL;
+ *nses = NULL;
+ *ntcon = NULL;
+
+ *xid = get_xid();
+
+ /* get a reference to a tcp session */
+ server = cifs_get_tcp_session(vol);
+ if (IS_ERR(server)) {
+ rc = PTR_ERR(server);
+ return rc;
+ }
+
+ *nserver = server;
+
+ if ((vol->max_credits < 20) || (vol->max_credits > 60000))
+ server->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
+ else
+ server->max_credits = vol->max_credits;
+
+ /* get a reference to a SMB session */
+ ses = cifs_get_smb_ses(server, vol);
+ if (IS_ERR(ses)) {
+ rc = PTR_ERR(ses);
+ return rc;
+ }
+
+ *nses = ses;
+
+ if ((vol->persistent == true) && (!(ses->server->capabilities &
+ SMB2_GLOBAL_CAP_PERSISTENT_HANDLES))) {
+ cifs_dbg(VFS, "persistent handles not supported by server\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* search for existing tcon to this server share */
+ tcon = cifs_get_tcon(ses, vol);
+ if (IS_ERR(tcon)) {
+ rc = PTR_ERR(tcon);
+ return rc;
+ }
+
+ *ntcon = tcon;
+
+ /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
+ if (tcon->posix_extensions)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+
+ /* tell server which Unix caps we support */
+ if (cap_unix(tcon->ses)) {
+ /*
+ * reset of caps checks mount to see if unix extensions disabled
+ * for just this mount.
+ */
+ reset_cifs_unix_caps(*xid, tcon, cifs_sb, vol);
+ if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
+ (le64_to_cpu(tcon->fsUnixInfo.Capability) &
+ CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP))
+ return -EACCES;
+ } else
+ tcon->unix_ext = 0; /* server does not support them */
+
+ /* do not care if a following call succeed - informational */
+ if (!tcon->pipe && server->ops->qfs_tcon)
+ server->ops->qfs_tcon(*xid, tcon);
+
+ cifs_sb->wsize = server->ops->negotiate_wsize(tcon, vol);
+ cifs_sb->rsize = server->ops->negotiate_rsize(tcon, vol);
+
+ return 0;
+}
+
+static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
+ struct cifs_tcon *tcon)
+{
+ struct tcon_link *tlink;
+
+ /* hang the tcon off of the superblock */
+ tlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
+ if (tlink == NULL)
+ return -ENOMEM;
+
+ tlink->tl_uid = ses->linux_uid;
+ tlink->tl_tcon = tcon;
+ tlink->tl_time = jiffies;
+ set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
+ set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+
+ cifs_sb->master_tlink = tlink;
+ spin_lock(&cifs_sb->tlink_tree_lock);
+ tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
+ spin_unlock(&cifs_sb->tlink_tree_lock);
+
+ queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
+ TLINK_IDLE_EXPIRE);
+ return 0;
+}
#ifdef CONFIG_CIFS_DFS_UPCALL
/*
@@ -3774,10 +4073,11 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info)
*/
static char *
build_unc_path_to_root(const struct smb_vol *vol,
- const struct cifs_sb_info *cifs_sb)
+ const struct cifs_sb_info *cifs_sb, bool useppath)
{
char *full_path, *pos;
- unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
+ unsigned int pplen = useppath && vol->prepath ?
+ strlen(vol->prepath) + 1 : 0;
unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1);
full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
@@ -3799,8 +4099,9 @@ build_unc_path_to_root(const struct smb_vol *vol,
return full_path;
}
-/*
- * Perform a dfs referral query for a share and (optionally) prefix
+/**
+ * expand_dfs_referral - Perform a dfs referral query and update the cifs_sb
+ *
*
* If a referral is found, cifs_sb->mountdata will be (re-)allocated
* to a string containing updated options for the submount. Otherwise it
@@ -3815,39 +4116,36 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
int check_prefix)
{
int rc;
- unsigned int num_referrals = 0;
- struct dfs_info3_param *referrals = NULL;
+ struct dfs_info3_param referral = {0};
char *full_path = NULL, *ref_path = NULL, *mdata = NULL;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
return -EREMOTE;
- full_path = build_unc_path_to_root(volume_info, cifs_sb);
+ full_path = build_unc_path_to_root(volume_info, cifs_sb, true);
if (IS_ERR(full_path))
return PTR_ERR(full_path);
/* For DFS paths, skip the first '\' of the UNC */
ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1;
- rc = get_dfs_path(xid, ses, ref_path, cifs_sb->local_nls,
- &num_referrals, &referrals, cifs_remap(cifs_sb));
-
- if (!rc && num_referrals > 0) {
+ rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
+ ref_path, &referral, NULL);
+ if (!rc) {
char *fake_devname = NULL;
mdata = cifs_compose_mount_options(cifs_sb->mountdata,
- full_path + 1, referrals,
+ full_path + 1, &referral,
&fake_devname);
-
- free_dfs_info_array(referrals, num_referrals);
+ free_dfs_info_param(&referral);
if (IS_ERR(mdata)) {
rc = PTR_ERR(mdata);
mdata = NULL;
} else {
- cleanup_volume_info_contents(volume_info);
+ cifs_cleanup_volume_info_contents(volume_info);
rc = cifs_setup_volume_info(volume_info, mdata,
- fake_devname, false);
+ fake_devname, false);
}
kfree(fake_devname);
kfree(cifs_sb->mountdata);
@@ -3856,6 +4154,143 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
kfree(full_path);
return rc;
}
+
+static inline int get_next_dfs_tgt(const char *path,
+ struct dfs_cache_tgt_list *tgt_list,
+ struct dfs_cache_tgt_iterator **tgt_it)
+{
+ if (!*tgt_it)
+ *tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
+ else
+ *tgt_it = dfs_cache_get_next_tgt(tgt_list, *tgt_it);
+ return !*tgt_it ? -EHOSTDOWN : 0;
+}
+
+static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it,
+ struct smb_vol *fake_vol, struct smb_vol *vol)
+{
+ const char *tgt = dfs_cache_get_tgt_name(tgt_it);
+ int len = strlen(tgt) + 2;
+ char *new_unc;
+
+ new_unc = kmalloc(len, GFP_KERNEL);
+ if (!new_unc)
+ return -ENOMEM;
+ snprintf(new_unc, len, "\\%s", tgt);
+
+ kfree(vol->UNC);
+ vol->UNC = new_unc;
+
+ if (fake_vol->prepath) {
+ kfree(vol->prepath);
+ vol->prepath = fake_vol->prepath;
+ fake_vol->prepath = NULL;
+ }
+ memcpy(&vol->dstaddr, &fake_vol->dstaddr, sizeof(vol->dstaddr));
+
+ return 0;
+}
+
+static int setup_dfs_tgt_conn(const char *path,
+ const struct dfs_cache_tgt_iterator *tgt_it,
+ struct cifs_sb_info *cifs_sb,
+ struct smb_vol *vol,
+ unsigned int *xid,
+ struct TCP_Server_Info **server,
+ struct cifs_ses **ses,
+ struct cifs_tcon **tcon)
+{
+ int rc;
+ struct dfs_info3_param ref = {0};
+ char *mdata = NULL, *fake_devname = NULL;
+ struct smb_vol fake_vol = {0};
+
+ cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path);
+
+ rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref);
+ if (rc)
+ return rc;
+
+ mdata = cifs_compose_mount_options(cifs_sb->mountdata, path, &ref,
+ &fake_devname);
+ free_dfs_info_param(&ref);
+
+ if (IS_ERR(mdata)) {
+ rc = PTR_ERR(mdata);
+ mdata = NULL;
+ } else {
+ cifs_dbg(FYI, "%s: fake_devname: %s\n", __func__, fake_devname);
+ rc = cifs_setup_volume_info(&fake_vol, mdata, fake_devname,
+ false);
+ }
+ kfree(mdata);
+ kfree(fake_devname);
+
+ if (!rc) {
+ /*
+ * We use a 'fake_vol' here because we need pass it down to the
+ * mount_{get,put} functions to test connection against new DFS
+ * targets.
+ */
+ mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon);
+ rc = mount_get_conns(&fake_vol, cifs_sb, xid, server, ses,
+ tcon);
+ if (!rc) {
+ /*
+ * We were able to connect to new target server.
+ * Update current volume info with new target server.
+ */
+ rc = update_vol_info(tgt_it, &fake_vol, vol);
+ }
+ }
+ cifs_cleanup_volume_info_contents(&fake_vol);
+ return rc;
+}
+
+static int mount_do_dfs_failover(const char *path,
+ struct cifs_sb_info *cifs_sb,
+ struct smb_vol *vol,
+ struct cifs_ses *root_ses,
+ unsigned int *xid,
+ struct TCP_Server_Info **server,
+ struct cifs_ses **ses,
+ struct cifs_tcon **tcon)
+{
+ int rc;
+ struct dfs_cache_tgt_list tgt_list;
+ struct dfs_cache_tgt_iterator *tgt_it = NULL;
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
+ return -EOPNOTSUPP;
+
+ rc = dfs_cache_noreq_find(path, NULL, &tgt_list);
+ if (rc)
+ return rc;
+
+ for (;;) {
+ /* Get next DFS target server - if any */
+ rc = get_next_dfs_tgt(path, &tgt_list, &tgt_it);
+ if (rc)
+ break;
+ /* Connect to next DFS target */
+ rc = setup_dfs_tgt_conn(path, tgt_it, cifs_sb, vol, xid, server,
+ ses, tcon);
+ if (!rc || rc == -EACCES || rc == -EOPNOTSUPP)
+ break;
+ }
+ if (!rc) {
+ /*
+ * Update DFS target hint in DFS referral cache with the target
+ * server we successfully reconnected to.
+ */
+ rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb), path,
+ tgt_it);
+ }
+ dfs_cache_free_tgts(&tgt_list);
+ return rc;
+}
#endif
static int
@@ -3954,107 +4389,108 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
return rc;
}
-int
-cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
+/*
+ * Check if path is remote (e.g. a DFS share). Return -EREMOTE if it is,
+ * otherwise 0.
+ */
+static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol,
+ const unsigned int xid,
+ struct TCP_Server_Info *server,
+ struct cifs_tcon *tcon)
{
int rc;
- unsigned int xid;
- struct cifs_ses *ses;
- struct cifs_tcon *tcon;
- struct TCP_Server_Info *server;
- char *full_path;
- struct tcon_link *tlink;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- int referral_walks_count = 0;
-#endif
-
-#ifdef CONFIG_CIFS_DFS_UPCALL
-try_mount_again:
- /* cleanup activities if we're chasing a referral */
- if (referral_walks_count) {
- if (tcon)
- cifs_put_tcon(tcon);
- else if (ses)
- cifs_put_smb_ses(ses);
+ char *full_path;
- cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
+ if (!server->ops->is_path_accessible)
+ return -EOPNOTSUPP;
- free_xid(xid);
- }
-#endif
- rc = 0;
- tcon = NULL;
- ses = NULL;
- server = NULL;
- full_path = NULL;
- tlink = NULL;
+ /*
+ * cifs_build_path_to_root works only when we have a valid tcon
+ */
+ full_path = cifs_build_path_to_root(vol, cifs_sb, tcon,
+ tcon->Flags & SMB_SHARE_IS_IN_DFS);
+ if (full_path == NULL)
+ return -ENOMEM;
- xid = get_xid();
+ cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
- /* get a reference to a tcp session */
- server = cifs_get_tcp_session(volume_info);
- if (IS_ERR(server)) {
- rc = PTR_ERR(server);
- goto out;
- }
- if ((volume_info->max_credits < 20) ||
- (volume_info->max_credits > 60000))
- server->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
- else
- server->max_credits = volume_info->max_credits;
- /* get a reference to a SMB session */
- ses = cifs_get_smb_ses(server, volume_info);
- if (IS_ERR(ses)) {
- rc = PTR_ERR(ses);
- ses = NULL;
- goto mount_fail_check;
+ rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
+ full_path);
+ if (rc != 0 && rc != -EREMOTE) {
+ kfree(full_path);
+ return rc;
}
- if ((volume_info->persistent == true) && ((ses->server->capabilities &
- SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) {
- cifs_dbg(VFS, "persistent handles not supported by server\n");
- rc = -EOPNOTSUPP;
- goto mount_fail_check;
+ if (rc != -EREMOTE) {
+ rc = cifs_are_all_path_components_accessible(server, xid, tcon,
+ cifs_sb,
+ full_path);
+ if (rc != 0) {
+ cifs_dbg(VFS, "cannot query dirs between root and final path, "
+ "enabling CIFS_MOUNT_USE_PREFIX_PATH\n");
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+ rc = 0;
+ }
}
- /* search for existing tcon to this server share */
- tcon = cifs_get_tcon(ses, volume_info);
- if (IS_ERR(tcon)) {
- rc = PTR_ERR(tcon);
- tcon = NULL;
- if (rc == -EACCES)
- goto mount_fail_check;
-
- goto remote_path_check;
- }
+ kfree(full_path);
+ return rc;
+}
- /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
- if (tcon->posix_extensions)
- cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
+{
+ int rc = 0;
+ unsigned int xid;
+ struct cifs_ses *ses;
+ struct cifs_tcon *root_tcon = NULL;
+ struct cifs_tcon *tcon = NULL;
+ struct TCP_Server_Info *server;
+ char *root_path = NULL, *full_path = NULL;
+ char *old_mountdata;
+ int count;
- /* tell server which Unix caps we support */
- if (cap_unix(tcon->ses)) {
- /* reset of caps checks mount to see if unix extensions
- disabled for just this mount */
- reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info);
- if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
- (le64_to_cpu(tcon->fsUnixInfo.Capability) &
- CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) {
- rc = -EACCES;
- goto mount_fail_check;
+ rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon);
+ if (!rc && tcon) {
+ /* If not a standalone DFS root, then check if path is remote */
+ rc = dfs_cache_find(xid, ses, cifs_sb->local_nls,
+ cifs_remap(cifs_sb), vol->UNC + 1, NULL,
+ NULL);
+ if (rc) {
+ rc = is_path_remote(cifs_sb, vol, xid, server, tcon);
+ if (!rc)
+ goto out;
+ if (rc != -EREMOTE)
+ goto error;
}
- } else
- tcon->unix_ext = 0; /* server does not support them */
-
- /* do not care if a following call succeed - informational */
- if (!tcon->pipe && server->ops->qfs_tcon)
- server->ops->qfs_tcon(xid, tcon);
-
- cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
- cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
+ }
+ /*
+ * If first DFS target server went offline and we failed to connect it,
+ * server and ses pointers are NULL at this point, though we still have
+ * chance to get a cached DFS referral in expand_dfs_referral() and
+ * retry next target available in it.
+ *
+ * If a NULL ses ptr is passed to dfs_cache_find(), a lookup will be
+ * performed against DFS path and *no* requests will be sent to server
+ * for any new DFS referrals. Hence it's safe to skip checking whether
+ * server or ses ptr is NULL.
+ */
+ if (rc == -EACCES || rc == -EOPNOTSUPP)
+ goto error;
+
+ root_path = build_unc_path_to_root(vol, cifs_sb, false);
+ if (IS_ERR(root_path)) {
+ rc = PTR_ERR(root_path);
+ root_path = NULL;
+ goto error;
+ }
-remote_path_check:
-#ifdef CONFIG_CIFS_DFS_UPCALL
+ full_path = build_unc_path_to_root(vol, cifs_sb, true);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ full_path = NULL;
+ goto error;
+ }
/*
* Perform an unconditional check for whether there are DFS
* referrals for this path without prefix, to provide support
@@ -4062,119 +4498,173 @@ remote_path_check:
* with PATH_NOT_COVERED to requests that include the prefix.
* Chase the referral if found, otherwise continue normally.
*/
- if (referral_walks_count == 0) {
- int refrc = expand_dfs_referral(xid, ses, volume_info, cifs_sb,
- false);
- if (!refrc) {
- referral_walks_count++;
- goto try_mount_again;
- }
+ old_mountdata = cifs_sb->mountdata;
+ (void)expand_dfs_referral(xid, ses, vol, cifs_sb, false);
+
+ if (cifs_sb->mountdata == NULL) {
+ rc = -ENOENT;
+ goto error;
}
-#endif
- /* check if a whole path is not remote */
- if (!rc && tcon) {
- if (!server->ops->is_path_accessible) {
- rc = -ENOSYS;
- goto mount_fail_check;
+ if (cifs_sb->mountdata != old_mountdata) {
+ /* If we were redirected, reconnect to new target server */
+ mount_put_conns(cifs_sb, xid, server, ses, tcon);
+ rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon);
+ }
+ if (rc) {
+ if (rc == -EACCES || rc == -EOPNOTSUPP)
+ goto error;
+ /* Perform DFS failover to any other DFS targets */
+ rc = mount_do_dfs_failover(root_path + 1, cifs_sb, vol, NULL,
+ &xid, &server, &ses, &tcon);
+ if (rc)
+ goto error;
+ }
+
+ kfree(root_path);
+ root_path = build_unc_path_to_root(vol, cifs_sb, false);
+ if (IS_ERR(root_path)) {
+ rc = PTR_ERR(root_path);
+ root_path = NULL;
+ goto error;
+ }
+ /* Cache out resolved root server */
+ (void)dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
+ root_path + 1, NULL, NULL);
+ /*
+ * Save root tcon for additional DFS requests to update or create a new
+ * DFS cache entry, or even perform DFS failover.
+ */
+ spin_lock(&cifs_tcp_ses_lock);
+ tcon->tc_count++;
+ tcon->dfs_path = root_path;
+ root_path = NULL;
+ tcon->remap = cifs_remap(cifs_sb);
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ root_tcon = tcon;
+
+ for (count = 1; ;) {
+ if (!rc && tcon) {
+ rc = is_path_remote(cifs_sb, vol, xid, server, tcon);
+ if (!rc || rc != -EREMOTE)
+ break;
}
/*
- * cifs_build_path_to_root works only when we have a valid tcon
+ * BB: when we implement proper loop detection,
+ * we will remove this check. But now we need it
+ * to prevent an indefinite loop if 'DFS tree' is
+ * misconfigured (i.e. has loops).
*/
- full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon,
- tcon->Flags & SMB_SHARE_IS_IN_DFS);
- if (full_path == NULL) {
- rc = -ENOMEM;
- goto mount_fail_check;
- }
- rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
- full_path);
- if (rc != 0 && rc != -EREMOTE) {
- kfree(full_path);
- goto mount_fail_check;
+ if (count++ > MAX_NESTED_LINKS) {
+ rc = -ELOOP;
+ break;
}
- if (rc != -EREMOTE) {
- rc = cifs_are_all_path_components_accessible(server,
- xid, tcon, cifs_sb,
- full_path);
- if (rc != 0) {
- cifs_dbg(VFS, "cannot query dirs between root and final path, "
- "enabling CIFS_MOUNT_USE_PREFIX_PATH\n");
- cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
- rc = 0;
- }
- }
kfree(full_path);
- }
-
- /* get referral if needed */
- if (rc == -EREMOTE) {
-#ifdef CONFIG_CIFS_DFS_UPCALL
- if (referral_walks_count > MAX_NESTED_LINKS) {
- /*
- * BB: when we implement proper loop detection,
- * we will remove this check. But now we need it
- * to prevent an indefinite loop if 'DFS tree' is
- * misconfigured (i.e. has loops).
- */
- rc = -ELOOP;
- goto mount_fail_check;
+ full_path = build_unc_path_to_root(vol, cifs_sb, true);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ full_path = NULL;
+ break;
}
- rc = expand_dfs_referral(xid, ses, volume_info, cifs_sb, true);
+ old_mountdata = cifs_sb->mountdata;
+ rc = expand_dfs_referral(xid, root_tcon->ses, vol, cifs_sb,
+ true);
+ if (rc)
+ break;
- if (!rc) {
- referral_walks_count++;
- goto try_mount_again;
+ if (cifs_sb->mountdata != old_mountdata) {
+ mount_put_conns(cifs_sb, xid, server, ses, tcon);
+ rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses,
+ &tcon);
+ }
+ if (rc) {
+ if (rc == -EACCES || rc == -EOPNOTSUPP)
+ break;
+ /* Perform DFS failover to any other DFS targets */
+ rc = mount_do_dfs_failover(full_path + 1, cifs_sb, vol,
+ root_tcon->ses, &xid,
+ &server, &ses, &tcon);
+ if (rc == -EACCES || rc == -EOPNOTSUPP || !server ||
+ !ses)
+ goto error;
}
- goto mount_fail_check;
-#else /* No DFS support, return error on mount */
- rc = -EOPNOTSUPP;
-#endif
}
+ cifs_put_tcon(root_tcon);
if (rc)
- goto mount_fail_check;
+ goto error;
- /* now, hang the tcon off of the superblock */
- tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
- if (tlink == NULL) {
+ spin_lock(&cifs_tcp_ses_lock);
+ if (!tcon->dfs_path) {
+ /* Save full path in new tcon to do failover when reconnecting tcons */
+ tcon->dfs_path = full_path;
+ full_path = NULL;
+ tcon->remap = cifs_remap(cifs_sb);
+ }
+ cifs_sb->origin_fullpath = kstrndup(tcon->dfs_path,
+ strlen(tcon->dfs_path),
+ GFP_ATOMIC);
+ if (!cifs_sb->origin_fullpath) {
+ spin_unlock(&cifs_tcp_ses_lock);
rc = -ENOMEM;
- goto mount_fail_check;
+ goto error;
}
+ spin_unlock(&cifs_tcp_ses_lock);
- tlink->tl_uid = ses->linux_uid;
- tlink->tl_tcon = tcon;
- tlink->tl_time = jiffies;
- set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
- set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+ rc = dfs_cache_add_vol(vol, cifs_sb->origin_fullpath);
+ if (rc) {
+ kfree(cifs_sb->origin_fullpath);
+ goto error;
+ }
+ /*
+ * After reconnecting to a different server, unique ids won't
+ * match anymore, so we disable serverino. This prevents
+ * dentry revalidation to think the dentry are stale (ESTALE).
+ */
+ cifs_autodisable_serverino(cifs_sb);
+out:
+ free_xid(xid);
+ return mount_setup_tlink(cifs_sb, ses, tcon);
- cifs_sb->master_tlink = tlink;
- spin_lock(&cifs_sb->tlink_tree_lock);
- tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
- spin_unlock(&cifs_sb->tlink_tree_lock);
+error:
+ kfree(full_path);
+ kfree(root_path);
+ mount_put_conns(cifs_sb, xid, server, ses, tcon);
+ return rc;
+}
+#else
+int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
+{
+ int rc = 0;
+ unsigned int xid;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
- queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
- TLINK_IDLE_EXPIRE);
+ rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon);
+ if (rc)
+ goto error;
-mount_fail_check:
- /* on error free sesinfo and tcon struct if needed */
- if (rc) {
- /* If find_unc succeeded then rc == 0 so we can not end */
- /* up accidentally freeing someone elses tcon struct */
- if (tcon)
- cifs_put_tcon(tcon);
- else if (ses)
- cifs_put_smb_ses(ses);
- else
- cifs_put_tcp_session(server, 0);
+ if (tcon) {
+ rc = is_path_remote(cifs_sb, vol, xid, server, tcon);
+ if (rc == -EREMOTE)
+ rc = -EOPNOTSUPP;
+ if (rc)
+ goto error;
}
-out:
free_xid(xid);
+
+ return mount_setup_tlink(cifs_sb, ses, tcon);
+
+error:
+ mount_put_conns(cifs_sb, xid, server, ses, tcon);
return rc;
}
+#endif
/*
* Issue a TREE_CONNECT request.
@@ -4370,6 +4860,10 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
kfree(cifs_sb->mountdata);
kfree(cifs_sb->prepath);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ dfs_cache_del_vol(cifs_sb->origin_fullpath);
+ kfree(cifs_sb->origin_fullpath);
+#endif
call_rcu(&cifs_sb->rcu, delayed_free);
}
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
new file mode 100644
index 0000000..cd63c4a
--- /dev/null
+++ b/fs/cifs/dfs_cache.c
@@ -0,0 +1,1367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DFS referral cache routines
+ *
+ * Copyright (c) 2018 Paulo Alcantara <palcantara@suse.de>
+ */
+
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/jhash.h>
+#include <linux/ktime.h>
+#include <linux/slab.h>
+#include <linux/nls.h>
+#include <linux/workqueue.h>
+#include "cifsglob.h"
+#include "smb2pdu.h"
+#include "smb2proto.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_unicode.h"
+#include "smb2glob.h"
+
+#include "dfs_cache.h"
+
+#define DFS_CACHE_HTABLE_SIZE 32
+#define DFS_CACHE_MAX_ENTRIES 64
+
+#define IS_INTERLINK_SET(v) ((v) & (DFSREF_REFERRAL_SERVER | \
+ DFSREF_STORAGE_SERVER))
+
+struct dfs_cache_tgt {
+ char *t_name;
+ struct list_head t_list;
+};
+
+struct dfs_cache_entry {
+ struct hlist_node ce_hlist;
+ const char *ce_path;
+ int ce_ttl;
+ int ce_srvtype;
+ int ce_flags;
+ struct timespec64 ce_etime;
+ int ce_path_consumed;
+ int ce_numtgts;
+ struct list_head ce_tlist;
+ struct dfs_cache_tgt *ce_tgthint;
+ struct rcu_head ce_rcu;
+};
+
+static struct kmem_cache *dfs_cache_slab __read_mostly;
+
+struct dfs_cache_vol_info {
+ char *vi_fullpath;
+ struct smb_vol vi_vol;
+ struct list_head vi_list;
+};
+
+struct dfs_cache {
+ struct mutex dc_lock;
+ struct nls_table *dc_nlsc;
+ struct list_head dc_vol_list;
+ int dc_ttl;
+ struct delayed_work dc_refresh;
+};
+
+static struct dfs_cache dfs_cache;
+
+/*
+ * Number of entries in the cache
+ */
+static size_t dfs_cache_count;
+
+static DEFINE_MUTEX(dfs_cache_list_lock);
+static struct hlist_head dfs_cache_htable[DFS_CACHE_HTABLE_SIZE];
+
+static void refresh_cache_worker(struct work_struct *work);
+
+static inline bool is_path_valid(const char *path)
+{
+ return path && (strchr(path + 1, '\\') || strchr(path + 1, '/'));
+}
+
+static inline int get_normalized_path(const char *path, char **npath)
+{
+ if (*path == '\\') {
+ *npath = (char *)path;
+ } else {
+ *npath = kstrndup(path, strlen(path), GFP_KERNEL);
+ if (!*npath)
+ return -ENOMEM;
+ convert_delimiter(*npath, '\\');
+ }
+ return 0;
+}
+
+static inline void free_normalized_path(const char *path, char *npath)
+{
+ if (path != npath)
+ kfree(npath);
+}
+
+static inline bool cache_entry_expired(const struct dfs_cache_entry *ce)
+{
+ struct timespec64 ts;
+
+ ktime_get_coarse_real_ts64(&ts);
+ return timespec64_compare(&ts, &ce->ce_etime) >= 0;
+}
+
+static inline void free_tgts(struct dfs_cache_entry *ce)
+{
+ struct dfs_cache_tgt *t, *n;
+
+ list_for_each_entry_safe(t, n, &ce->ce_tlist, t_list) {
+ list_del(&t->t_list);
+ kfree(t->t_name);
+ kfree(t);
+ }
+}
+
+static void free_cache_entry(struct rcu_head *rcu)
+{
+ struct dfs_cache_entry *ce = container_of(rcu, struct dfs_cache_entry,
+ ce_rcu);
+ kmem_cache_free(dfs_cache_slab, ce);
+}
+
+static inline void flush_cache_ent(struct dfs_cache_entry *ce)
+{
+ if (hlist_unhashed(&ce->ce_hlist))
+ return;
+
+ hlist_del_init_rcu(&ce->ce_hlist);
+ kfree(ce->ce_path);
+ free_tgts(ce);
+ dfs_cache_count--;
+ call_rcu(&ce->ce_rcu, free_cache_entry);
+}
+
+static void flush_cache_ents(void)
+{
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++) {
+ struct hlist_head *l = &dfs_cache_htable[i];
+ struct dfs_cache_entry *ce;
+
+ hlist_for_each_entry_rcu(ce, l, ce_hlist)
+ flush_cache_ent(ce);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * dfs cache /proc file
+ */
+static int dfscache_proc_show(struct seq_file *m, void *v)
+{
+ int bucket;
+ struct dfs_cache_entry *ce;
+ struct dfs_cache_tgt *t;
+
+ seq_puts(m, "DFS cache\n---------\n");
+
+ mutex_lock(&dfs_cache_list_lock);
+
+ rcu_read_lock();
+ hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) {
+ seq_printf(m,
+ "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
+ "interlink=%s,path_consumed=%d,expired=%s\n",
+ ce->ce_path,
+ ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link",
+ ce->ce_ttl, ce->ce_etime.tv_nsec,
+ IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no",
+ ce->ce_path_consumed,
+ cache_entry_expired(ce) ? "yes" : "no");
+
+ list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ seq_printf(m, " %s%s\n",
+ t->t_name,
+ ce->ce_tgthint == t ? " (target hint)" : "");
+ }
+
+ }
+ rcu_read_unlock();
+
+ mutex_unlock(&dfs_cache_list_lock);
+ return 0;
+}
+
+static ssize_t dfscache_proc_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ char c;
+ int rc;
+
+ rc = get_user(c, buffer);
+ if (rc)
+ return rc;
+
+ if (c != '0')
+ return -EINVAL;
+
+ cifs_dbg(FYI, "clearing dfs cache");
+ mutex_lock(&dfs_cache_list_lock);
+ flush_cache_ents();
+ mutex_unlock(&dfs_cache_list_lock);
+
+ return count;
+}
+
+static int dfscache_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dfscache_proc_show, NULL);
+}
+
+const struct file_operations dfscache_proc_fops = {
+ .open = dfscache_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = dfscache_proc_write,
+};
+
+#ifdef CONFIG_CIFS_DEBUG2
+static inline void dump_tgts(const struct dfs_cache_entry *ce)
+{
+ struct dfs_cache_tgt *t;
+
+ cifs_dbg(FYI, "target list:\n");
+ list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ cifs_dbg(FYI, " %s%s\n", t->t_name,
+ ce->ce_tgthint == t ? " (target hint)" : "");
+ }
+}
+
+static inline void dump_ce(const struct dfs_cache_entry *ce)
+{
+ cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
+ "interlink=%s,path_consumed=%d,expired=%s\n", ce->ce_path,
+ ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ce_ttl,
+ ce->ce_etime.tv_nsec,
+ IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no",
+ ce->ce_path_consumed,
+ cache_entry_expired(ce) ? "yes" : "no");
+ dump_tgts(ce);
+}
+
+static inline void dump_refs(const struct dfs_info3_param *refs, int numrefs)
+{
+ int i;
+
+ cifs_dbg(FYI, "DFS referrals returned by the server:\n");
+ for (i = 0; i < numrefs; i++) {
+ const struct dfs_info3_param *ref = &refs[i];
+
+ cifs_dbg(FYI,
+ "\n"
+ "flags: 0x%x\n"
+ "path_consumed: %d\n"
+ "server_type: 0x%x\n"
+ "ref_flag: 0x%x\n"
+ "path_name: %s\n"
+ "node_name: %s\n"
+ "ttl: %d (%dm)\n",
+ ref->flags, ref->path_consumed, ref->server_type,
+ ref->ref_flag, ref->path_name, ref->node_name,
+ ref->ttl, ref->ttl / 60);
+ }
+}
+#else
+#define dump_tgts(e)
+#define dump_ce(e)
+#define dump_refs(r, n)
+#endif
+
+/**
+ * dfs_cache_init - Initialize DFS referral cache.
+ *
+ * Return zero if initialized successfully, otherwise non-zero.
+ */
+int dfs_cache_init(void)
+{
+ int i;
+
+ dfs_cache_slab = kmem_cache_create("cifs_dfs_cache",
+ sizeof(struct dfs_cache_entry), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!dfs_cache_slab)
+ return -ENOMEM;
+
+ for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&dfs_cache_htable[i]);
+
+ INIT_LIST_HEAD(&dfs_cache.dc_vol_list);
+ mutex_init(&dfs_cache.dc_lock);
+ INIT_DELAYED_WORK(&dfs_cache.dc_refresh, refresh_cache_worker);
+ dfs_cache.dc_ttl = -1;
+ dfs_cache.dc_nlsc = load_nls_default();
+
+ cifs_dbg(FYI, "%s: initialized DFS referral cache\n", __func__);
+ return 0;
+}
+
+static inline unsigned int cache_entry_hash(const void *data, int size)
+{
+ unsigned int h;
+
+ h = jhash(data, size, 0);
+ return h & (DFS_CACHE_HTABLE_SIZE - 1);
+}
+
+/* Check whether second path component of @path is SYSVOL or NETLOGON */
+static inline bool is_sysvol_or_netlogon(const char *path)
+{
+ const char *s;
+ char sep = path[0];
+
+ s = strchr(path + 1, sep) + 1;
+ return !strncasecmp(s, "sysvol", strlen("sysvol")) ||
+ !strncasecmp(s, "netlogon", strlen("netlogon"));
+}
+
+/* Return target hint of a DFS cache entry */
+static inline char *get_tgt_name(const struct dfs_cache_entry *ce)
+{
+ struct dfs_cache_tgt *t = ce->ce_tgthint;
+
+ return t ? t->t_name : ERR_PTR(-ENOENT);
+}
+
+/* Return expire time out of a new entry's TTL */
+static inline struct timespec64 get_expire_time(int ttl)
+{
+ struct timespec64 ts = {
+ .tv_sec = ttl,
+ .tv_nsec = 0,
+ };
+ struct timespec64 now;
+
+ ktime_get_coarse_real_ts64(&now);
+ return timespec64_add(now, ts);
+}
+
+/* Allocate a new DFS target */
+static inline struct dfs_cache_tgt *alloc_tgt(const char *name)
+{
+ struct dfs_cache_tgt *t;
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return ERR_PTR(-ENOMEM);
+ t->t_name = kstrndup(name, strlen(name), GFP_KERNEL);
+ if (!t->t_name) {
+ kfree(t);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&t->t_list);
+ return t;
+}
+
+/*
+ * Copy DFS referral information to a cache entry and conditionally update
+ * target hint.
+ */
+static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs,
+ struct dfs_cache_entry *ce, const char *tgthint)
+{
+ int i;
+
+ ce->ce_ttl = refs[0].ttl;
+ ce->ce_etime = get_expire_time(ce->ce_ttl);
+ ce->ce_srvtype = refs[0].server_type;
+ ce->ce_flags = refs[0].ref_flag;
+ ce->ce_path_consumed = refs[0].path_consumed;
+
+ for (i = 0; i < numrefs; i++) {
+ struct dfs_cache_tgt *t;
+
+ t = alloc_tgt(refs[i].node_name);
+ if (IS_ERR(t)) {
+ free_tgts(ce);
+ return PTR_ERR(t);
+ }
+ if (tgthint && !strcasecmp(t->t_name, tgthint)) {
+ list_add(&t->t_list, &ce->ce_tlist);
+ tgthint = NULL;
+ } else {
+ list_add_tail(&t->t_list, &ce->ce_tlist);
+ }
+ ce->ce_numtgts++;
+ }
+
+ ce->ce_tgthint = list_first_entry_or_null(&ce->ce_tlist,
+ struct dfs_cache_tgt, t_list);
+
+ return 0;
+}
+
+/* Allocate a new cache entry */
+static struct dfs_cache_entry *
+alloc_cache_entry(const char *path, const struct dfs_info3_param *refs,
+ int numrefs)
+{
+ struct dfs_cache_entry *ce;
+ int rc;
+
+ ce = kmem_cache_zalloc(dfs_cache_slab, GFP_KERNEL);
+ if (!ce)
+ return ERR_PTR(-ENOMEM);
+
+ ce->ce_path = kstrdup_const(path, GFP_KERNEL);
+ if (!ce->ce_path) {
+ kmem_cache_free(dfs_cache_slab, ce);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_HLIST_NODE(&ce->ce_hlist);
+ INIT_LIST_HEAD(&ce->ce_tlist);
+
+ rc = copy_ref_data(refs, numrefs, ce, NULL);
+ if (rc) {
+ kfree(ce->ce_path);
+ kmem_cache_free(dfs_cache_slab, ce);
+ ce = ERR_PTR(rc);
+ }
+ return ce;
+}
+
+static void remove_oldest_entry(void)
+{
+ int bucket;
+ struct dfs_cache_entry *ce;
+ struct dfs_cache_entry *to_del = NULL;
+
+ rcu_read_lock();
+ hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) {
+ if (!to_del || timespec64_compare(&ce->ce_etime,
+ &to_del->ce_etime) < 0)
+ to_del = ce;
+ }
+ if (!to_del) {
+ cifs_dbg(FYI, "%s: no entry to remove", __func__);
+ goto out;
+ }
+ cifs_dbg(FYI, "%s: removing entry", __func__);
+ dump_ce(to_del);
+ flush_cache_ent(to_del);
+out:
+ rcu_read_unlock();
+}
+
+/* Add a new DFS cache entry */
+static inline struct dfs_cache_entry *
+add_cache_entry(unsigned int hash, const char *path,
+ const struct dfs_info3_param *refs, int numrefs)
+{
+ struct dfs_cache_entry *ce;
+
+ ce = alloc_cache_entry(path, refs, numrefs);
+ if (IS_ERR(ce))
+ return ce;
+
+ hlist_add_head_rcu(&ce->ce_hlist, &dfs_cache_htable[hash]);
+
+ mutex_lock(&dfs_cache.dc_lock);
+ if (dfs_cache.dc_ttl < 0) {
+ dfs_cache.dc_ttl = ce->ce_ttl;
+ queue_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh,
+ dfs_cache.dc_ttl * HZ);
+ } else {
+ dfs_cache.dc_ttl = min_t(int, dfs_cache.dc_ttl, ce->ce_ttl);
+ mod_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh,
+ dfs_cache.dc_ttl * HZ);
+ }
+ mutex_unlock(&dfs_cache.dc_lock);
+
+ return ce;
+}
+
+static struct dfs_cache_entry *__find_cache_entry(unsigned int hash,
+ const char *path)
+{
+ struct dfs_cache_entry *ce;
+ bool found = false;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ce, &dfs_cache_htable[hash], ce_hlist) {
+ if (!strcasecmp(path, ce->ce_path)) {
+#ifdef CONFIG_CIFS_DEBUG2
+ char *name = get_tgt_name(ce);
+
+ if (unlikely(IS_ERR(name))) {
+ rcu_read_unlock();
+ return ERR_CAST(name);
+ }
+ cifs_dbg(FYI, "%s: cache hit\n", __func__);
+ cifs_dbg(FYI, "%s: target hint: %s\n", __func__, name);
+#endif
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return found ? ce : ERR_PTR(-ENOENT);
+}
+
+/*
+ * Find a DFS cache entry in hash table and optionally check prefix path against
+ * @path.
+ * Use whole path components in the match.
+ * Return ERR_PTR(-ENOENT) if the entry is not found.
+ */
+static inline struct dfs_cache_entry *find_cache_entry(const char *path,
+ unsigned int *hash)
+{
+ *hash = cache_entry_hash(path, strlen(path));
+ return __find_cache_entry(*hash, path);
+}
+
+static inline void destroy_slab_cache(void)
+{
+ rcu_barrier();
+ kmem_cache_destroy(dfs_cache_slab);
+}
+
+static inline void free_vol(struct dfs_cache_vol_info *vi)
+{
+ list_del(&vi->vi_list);
+ kfree(vi->vi_fullpath);
+ cifs_cleanup_volume_info_contents(&vi->vi_vol);
+ kfree(vi);
+}
+
+static inline void free_vol_list(void)
+{
+ struct dfs_cache_vol_info *vi, *nvi;
+
+ list_for_each_entry_safe(vi, nvi, &dfs_cache.dc_vol_list, vi_list)
+ free_vol(vi);
+}
+
+/**
+ * dfs_cache_destroy - destroy DFS referral cache
+ */
+void dfs_cache_destroy(void)
+{
+ cancel_delayed_work_sync(&dfs_cache.dc_refresh);
+ unload_nls(dfs_cache.dc_nlsc);
+ free_vol_list();
+ mutex_destroy(&dfs_cache.dc_lock);
+
+ flush_cache_ents();
+ destroy_slab_cache();
+ mutex_destroy(&dfs_cache_list_lock);
+
+ cifs_dbg(FYI, "%s: destroyed DFS referral cache\n", __func__);
+}
+
+static inline struct dfs_cache_entry *
+__update_cache_entry(const char *path, const struct dfs_info3_param *refs,
+ int numrefs)
+{
+ int rc;
+ unsigned int h;
+ struct dfs_cache_entry *ce;
+ char *s, *th = NULL;
+
+ ce = find_cache_entry(path, &h);
+ if (IS_ERR(ce))
+ return ce;
+
+ if (ce->ce_tgthint) {
+ s = ce->ce_tgthint->t_name;
+ th = kstrndup(s, strlen(s), GFP_KERNEL);
+ if (!th)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ free_tgts(ce);
+ ce->ce_numtgts = 0;
+
+ rc = copy_ref_data(refs, numrefs, ce, th);
+ kfree(th);
+
+ if (rc)
+ ce = ERR_PTR(rc);
+
+ return ce;
+}
+
+/* Update an expired cache entry by getting a new DFS referral from server */
+static struct dfs_cache_entry *
+update_cache_entry(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,
+ const char *path, struct dfs_cache_entry *ce)
+{
+ int rc;
+ struct dfs_info3_param *refs = NULL;
+ int numrefs = 0;
+
+ cifs_dbg(FYI, "%s: update expired cache entry\n", __func__);
+ /*
+ * Check if caller provided enough parameters to update an expired
+ * entry.
+ */
+ if (!ses || !ses->server || !ses->server->ops->get_dfs_refer)
+ return ERR_PTR(-ETIME);
+ if (unlikely(!nls_codepage))
+ return ERR_PTR(-ETIME);
+
+ cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__, path);
+
+ rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs, &numrefs,
+ nls_codepage, remap);
+ if (rc)
+ ce = ERR_PTR(rc);
+ else
+ ce = __update_cache_entry(path, refs, numrefs);
+
+ dump_refs(refs, numrefs);
+ free_dfs_info_array(refs, numrefs);
+
+ return ce;
+}
+
+/*
+ * Find, create or update a DFS cache entry.
+ *
+ * If the entry wasn't found, it will create a new one. Or if it was found but
+ * expired, then it will update the entry accordingly.
+ *
+ * For interlinks, __cifs_dfs_mount() and expand_dfs_referral() are supposed to
+ * handle them properly.
+ */
+static struct dfs_cache_entry *
+do_dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,
+ const char *path, bool noreq)
+{
+ int rc;
+ unsigned int h;
+ struct dfs_cache_entry *ce;
+ struct dfs_info3_param *nrefs;
+ int numnrefs;
+
+ cifs_dbg(FYI, "%s: search path: %s\n", __func__, path);
+
+ ce = find_cache_entry(path, &h);
+ if (IS_ERR(ce)) {
+ cifs_dbg(FYI, "%s: cache miss\n", __func__);
+ /*
+ * If @noreq is set, no requests will be sent to the server for
+ * either updating or getting a new DFS referral.
+ */
+ if (noreq)
+ return ce;
+ /*
+ * No cache entry was found, so check for valid parameters that
+ * will be required to get a new DFS referral and then create a
+ * new cache entry.
+ */
+ if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) {
+ ce = ERR_PTR(-EOPNOTSUPP);
+ return ce;
+ }
+ if (unlikely(!nls_codepage)) {
+ ce = ERR_PTR(-EINVAL);
+ return ce;
+ }
+
+ nrefs = NULL;
+ numnrefs = 0;
+
+ cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__,
+ path);
+
+ rc = ses->server->ops->get_dfs_refer(xid, ses, path, &nrefs,
+ &numnrefs, nls_codepage,
+ remap);
+ if (rc) {
+ ce = ERR_PTR(rc);
+ return ce;
+ }
+
+ dump_refs(nrefs, numnrefs);
+
+ cifs_dbg(FYI, "%s: new cache entry\n", __func__);
+
+ if (dfs_cache_count >= DFS_CACHE_MAX_ENTRIES) {
+ cifs_dbg(FYI, "%s: reached max cache size (%d)",
+ __func__, DFS_CACHE_MAX_ENTRIES);
+ remove_oldest_entry();
+ }
+ ce = add_cache_entry(h, path, nrefs, numnrefs);
+ free_dfs_info_array(nrefs, numnrefs);
+
+ if (IS_ERR(ce))
+ return ce;
+
+ dfs_cache_count++;
+ }
+
+ dump_ce(ce);
+
+ /* Just return the found cache entry in case @noreq is set */
+ if (noreq)
+ return ce;
+
+ if (cache_entry_expired(ce)) {
+ cifs_dbg(FYI, "%s: expired cache entry\n", __func__);
+ ce = update_cache_entry(xid, ses, nls_codepage, remap, path,
+ ce);
+ if (IS_ERR(ce)) {
+ cifs_dbg(FYI, "%s: failed to update expired entry\n",
+ __func__);
+ }
+ }
+ return ce;
+}
+
+/* Set up a new DFS referral from a given cache entry */
+static int setup_ref(const char *path, const struct dfs_cache_entry *ce,
+ struct dfs_info3_param *ref, const char *tgt)
+{
+ int rc;
+
+ cifs_dbg(FYI, "%s: set up new ref\n", __func__);
+
+ memset(ref, 0, sizeof(*ref));
+
+ ref->path_name = kstrndup(path, strlen(path), GFP_KERNEL);
+ if (!ref->path_name)
+ return -ENOMEM;
+
+ ref->path_consumed = ce->ce_path_consumed;
+
+ ref->node_name = kstrndup(tgt, strlen(tgt), GFP_KERNEL);
+ if (!ref->node_name) {
+ rc = -ENOMEM;
+ goto err_free_path;
+ }
+
+ ref->ttl = ce->ce_ttl;
+ ref->server_type = ce->ce_srvtype;
+ ref->ref_flag = ce->ce_flags;
+
+ return 0;
+
+err_free_path:
+ kfree(ref->path_name);
+ ref->path_name = NULL;
+ return rc;
+}
+
+/* Return target list of a DFS cache entry */
+static int get_tgt_list(const struct dfs_cache_entry *ce,
+ struct dfs_cache_tgt_list *tl)
+{
+ int rc;
+ struct list_head *head = &tl->tl_list;
+ struct dfs_cache_tgt *t;
+ struct dfs_cache_tgt_iterator *it, *nit;
+
+ memset(tl, 0, sizeof(*tl));
+ INIT_LIST_HEAD(head);
+
+ list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ it = kzalloc(sizeof(*it), GFP_KERNEL);
+ if (!it) {
+ rc = -ENOMEM;
+ goto err_free_it;
+ }
+
+ it->it_name = kstrndup(t->t_name, strlen(t->t_name),
+ GFP_KERNEL);
+ if (!it->it_name) {
+ rc = -ENOMEM;
+ goto err_free_it;
+ }
+
+ if (ce->ce_tgthint == t)
+ list_add(&it->it_list, head);
+ else
+ list_add_tail(&it->it_list, head);
+ }
+ tl->tl_numtgts = ce->ce_numtgts;
+
+ return 0;
+
+err_free_it:
+ list_for_each_entry_safe(it, nit, head, it_list) {
+ kfree(it->it_name);
+ kfree(it);
+ }
+ return rc;
+}
+
+/**
+ * dfs_cache_find - find a DFS cache entry
+ *
+ * If it doesn't find the cache entry, then it will get a DFS referral
+ * for @path and create a new entry.
+ *
+ * In case the cache entry exists but expired, it will get a DFS referral
+ * for @path and then update the respective cache entry.
+ *
+ * These parameters are passed down to the get_dfs_refer() call if it
+ * needs to be issued:
+ * @xid: syscall xid
+ * @ses: smb session to issue the request on
+ * @nls_codepage: charset conversion
+ * @remap: path character remapping type
+ * @path: path to lookup in DFS referral cache.
+ *
+ * @ref: when non-NULL, store single DFS referral result in it.
+ * @tgt_list: when non-NULL, store complete DFS target list in it.
+ *
+ * Return zero if the target was found, otherwise non-zero.
+ */
+int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,
+ const char *path, struct dfs_info3_param *ref,
+ struct dfs_cache_tgt_list *tgt_list)
+{
+ int rc;
+ char *npath;
+ struct dfs_cache_entry *ce;
+
+ if (unlikely(!is_path_valid(path)))
+ return -EINVAL;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ return rc;
+
+ mutex_lock(&dfs_cache_list_lock);
+ ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
+ if (!IS_ERR(ce)) {
+ if (ref)
+ rc = setup_ref(path, ce, ref, get_tgt_name(ce));
+ else
+ rc = 0;
+ if (!rc && tgt_list)
+ rc = get_tgt_list(ce, tgt_list);
+ } else {
+ rc = PTR_ERR(ce);
+ }
+ mutex_unlock(&dfs_cache_list_lock);
+ free_normalized_path(path, npath);
+ return rc;
+}
+
+/**
+ * dfs_cache_noreq_find - find a DFS cache entry without sending any requests to
+ * the currently connected server.
+ *
+ * NOTE: This function will neither update a cache entry in case it was
+ * expired, nor create a new cache entry if @path hasn't been found. It heavily
+ * relies on an existing cache entry.
+ *
+ * @path: path to lookup in the DFS referral cache.
+ * @ref: when non-NULL, store single DFS referral result in it.
+ * @tgt_list: when non-NULL, store complete DFS target list in it.
+ *
+ * Return 0 if successful.
+ * Return -ENOENT if the entry was not found.
+ * Return non-zero for other errors.
+ */
+int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
+ struct dfs_cache_tgt_list *tgt_list)
+{
+ int rc;
+ char *npath;
+ struct dfs_cache_entry *ce;
+
+ if (unlikely(!is_path_valid(path)))
+ return -EINVAL;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ return rc;
+
+ mutex_lock(&dfs_cache_list_lock);
+ ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true);
+ if (IS_ERR(ce)) {
+ rc = PTR_ERR(ce);
+ goto out;
+ }
+
+ if (ref)
+ rc = setup_ref(path, ce, ref, get_tgt_name(ce));
+ else
+ rc = 0;
+ if (!rc && tgt_list)
+ rc = get_tgt_list(ce, tgt_list);
+out:
+ mutex_unlock(&dfs_cache_list_lock);
+ free_normalized_path(path, npath);
+ return rc;
+}
+
+/**
+ * dfs_cache_update_tgthint - update target hint of a DFS cache entry
+ *
+ * If it doesn't find the cache entry, then it will get a DFS referral for @path
+ * and create a new entry.
+ *
+ * In case the cache entry exists but expired, it will get a DFS referral
+ * for @path and then update the respective cache entry.
+ *
+ * @xid: syscall id
+ * @ses: smb session
+ * @nls_codepage: charset conversion
+ * @remap: type of character remapping for paths
+ * @path: path to lookup in DFS referral cache.
+ * @it: DFS target iterator
+ *
+ * Return zero if the target hint was updated successfully, otherwise non-zero.
+ */
+int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,
+ const char *path,
+ const struct dfs_cache_tgt_iterator *it)
+{
+ int rc;
+ char *npath;
+ struct dfs_cache_entry *ce;
+ struct dfs_cache_tgt *t;
+
+ if (unlikely(!is_path_valid(path)))
+ return -EINVAL;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ return rc;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+
+ mutex_lock(&dfs_cache_list_lock);
+ ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
+ if (IS_ERR(ce)) {
+ rc = PTR_ERR(ce);
+ goto out;
+ }
+
+ rc = 0;
+
+ t = ce->ce_tgthint;
+
+ if (likely(!strcasecmp(it->it_name, t->t_name)))
+ goto out;
+
+ list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ if (!strcasecmp(t->t_name, it->it_name)) {
+ ce->ce_tgthint = t;
+ cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
+ it->it_name);
+ break;
+ }
+ }
+
+out:
+ mutex_unlock(&dfs_cache_list_lock);
+ free_normalized_path(path, npath);
+ return rc;
+}
+
+/**
+ * dfs_cache_noreq_update_tgthint - update target hint of a DFS cache entry
+ * without sending any requests to the currently connected server.
+ *
+ * NOTE: This function will neither update a cache entry in case it was
+ * expired, nor create a new cache entry if @path hasn't been found. It heavily
+ * relies on an existing cache entry.
+ *
+ * @path: path to lookup in DFS referral cache.
+ * @it: target iterator which contains the target hint to update the cache
+ * entry with.
+ *
+ * Return zero if the target hint was updated successfully, otherwise non-zero.
+ */
+int dfs_cache_noreq_update_tgthint(const char *path,
+ const struct dfs_cache_tgt_iterator *it)
+{
+ int rc;
+ char *npath;
+ struct dfs_cache_entry *ce;
+ struct dfs_cache_tgt *t;
+
+ if (unlikely(!is_path_valid(path)) || !it)
+ return -EINVAL;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ return rc;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+
+ mutex_lock(&dfs_cache_list_lock);
+
+ ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true);
+ if (IS_ERR(ce)) {
+ rc = PTR_ERR(ce);
+ goto out;
+ }
+
+ rc = 0;
+
+ t = ce->ce_tgthint;
+
+ if (unlikely(!strcasecmp(it->it_name, t->t_name)))
+ goto out;
+
+ list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ if (!strcasecmp(t->t_name, it->it_name)) {
+ ce->ce_tgthint = t;
+ cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
+ it->it_name);
+ break;
+ }
+ }
+
+out:
+ mutex_unlock(&dfs_cache_list_lock);
+ free_normalized_path(path, npath);
+ return rc;
+}
+
+/**
+ * dfs_cache_get_tgt_referral - returns a DFS referral (@ref) from a given
+ * target iterator (@it).
+ *
+ * @path: path to lookup in DFS referral cache.
+ * @it: DFS target iterator.
+ * @ref: DFS referral pointer to set up the gathered information.
+ *
+ * Return zero if the DFS referral was set up correctly, otherwise non-zero.
+ */
+int dfs_cache_get_tgt_referral(const char *path,
+ const struct dfs_cache_tgt_iterator *it,
+ struct dfs_info3_param *ref)
+{
+ int rc;
+ char *npath;
+ struct dfs_cache_entry *ce;
+ unsigned int h;
+
+ if (!it || !ref)
+ return -EINVAL;
+ if (unlikely(!is_path_valid(path)))
+ return -EINVAL;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ return rc;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+
+ mutex_lock(&dfs_cache_list_lock);
+
+ ce = find_cache_entry(npath, &h);
+ if (IS_ERR(ce)) {
+ rc = PTR_ERR(ce);
+ goto out;
+ }
+
+ cifs_dbg(FYI, "%s: target name: %s\n", __func__, it->it_name);
+
+ rc = setup_ref(path, ce, ref, it->it_name);
+
+out:
+ mutex_unlock(&dfs_cache_list_lock);
+ free_normalized_path(path, npath);
+ return rc;
+}
+
+static int dup_vol(struct smb_vol *vol, struct smb_vol *new)
+{
+ memcpy(new, vol, sizeof(*new));
+
+ if (vol->username) {
+ new->username = kstrndup(vol->username, strlen(vol->username),
+ GFP_KERNEL);
+ if (!new->username)
+ return -ENOMEM;
+ }
+ if (vol->password) {
+ new->password = kstrndup(vol->password, strlen(vol->password),
+ GFP_KERNEL);
+ if (!new->password)
+ goto err_free_username;
+ }
+ if (vol->UNC) {
+ cifs_dbg(FYI, "%s: vol->UNC: %s\n", __func__, vol->UNC);
+ new->UNC = kstrndup(vol->UNC, strlen(vol->UNC), GFP_KERNEL);
+ if (!new->UNC)
+ goto err_free_password;
+ }
+ if (vol->domainname) {
+ new->domainname = kstrndup(vol->domainname,
+ strlen(vol->domainname), GFP_KERNEL);
+ if (!new->domainname)
+ goto err_free_unc;
+ }
+ if (vol->iocharset) {
+ new->iocharset = kstrndup(vol->iocharset,
+ strlen(vol->iocharset), GFP_KERNEL);
+ if (!new->iocharset)
+ goto err_free_domainname;
+ }
+ if (vol->prepath) {
+ cifs_dbg(FYI, "%s: vol->prepath: %s\n", __func__, vol->prepath);
+ new->prepath = kstrndup(vol->prepath, strlen(vol->prepath),
+ GFP_KERNEL);
+ if (!new->prepath)
+ goto err_free_iocharset;
+ }
+
+ return 0;
+
+err_free_iocharset:
+ kfree(new->iocharset);
+err_free_domainname:
+ kfree(new->domainname);
+err_free_unc:
+ kfree(new->UNC);
+err_free_password:
+ kzfree(new->password);
+err_free_username:
+ kfree(new->username);
+ kfree(new);
+ return -ENOMEM;
+}
+
+/**
+ * dfs_cache_add_vol - add a cifs volume during mount() that will be handled by
+ * DFS cache refresh worker.
+ *
+ * @vol: cifs volume.
+ * @fullpath: origin full path.
+ *
+ * Return zero if volume was set up correctly, otherwise non-zero.
+ */
+int dfs_cache_add_vol(struct smb_vol *vol, const char *fullpath)
+{
+ int rc;
+ struct dfs_cache_vol_info *vi;
+
+ if (!vol || !fullpath)
+ return -EINVAL;
+
+ cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
+
+ vi = kzalloc(sizeof(*vi), GFP_KERNEL);
+ if (!vi)
+ return -ENOMEM;
+
+ vi->vi_fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
+ if (!vi->vi_fullpath) {
+ rc = -ENOMEM;
+ goto err_free_vi;
+ }
+
+ rc = dup_vol(vol, &vi->vi_vol);
+ if (rc)
+ goto err_free_fullpath;
+
+ mutex_lock(&dfs_cache.dc_lock);
+ list_add_tail(&vi->vi_list, &dfs_cache.dc_vol_list);
+ mutex_unlock(&dfs_cache.dc_lock);
+ return 0;
+
+err_free_fullpath:
+ kfree(vi->vi_fullpath);
+err_free_vi:
+ kfree(vi);
+ return rc;
+}
+
+static inline struct dfs_cache_vol_info *find_vol(const char *fullpath)
+{
+ struct dfs_cache_vol_info *vi;
+
+ list_for_each_entry(vi, &dfs_cache.dc_vol_list, vi_list) {
+ cifs_dbg(FYI, "%s: vi->vi_fullpath: %s\n", __func__,
+ vi->vi_fullpath);
+ if (!strcasecmp(vi->vi_fullpath, fullpath))
+ return vi;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+/**
+ * dfs_cache_update_vol - update vol info in DFS cache after failover
+ *
+ * @fullpath: fullpath to look up in volume list.
+ * @server: TCP ses pointer.
+ *
+ * Return zero if volume was updated, otherwise non-zero.
+ */
+int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server)
+{
+ int rc;
+ struct dfs_cache_vol_info *vi;
+
+ if (!fullpath || !server)
+ return -EINVAL;
+
+ cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
+
+ mutex_lock(&dfs_cache.dc_lock);
+
+ vi = find_vol(fullpath);
+ if (IS_ERR(vi)) {
+ rc = PTR_ERR(vi);
+ goto out;
+ }
+
+ cifs_dbg(FYI, "%s: updating volume info\n", __func__);
+ memcpy(&vi->vi_vol.dstaddr, &server->dstaddr,
+ sizeof(vi->vi_vol.dstaddr));
+ rc = 0;
+
+out:
+ mutex_unlock(&dfs_cache.dc_lock);
+ return rc;
+}
+
+/**
+ * dfs_cache_del_vol - remove volume info in DFS cache during umount()
+ *
+ * @fullpath: fullpath to look up in volume list.
+ */
+void dfs_cache_del_vol(const char *fullpath)
+{
+ struct dfs_cache_vol_info *vi;
+
+ if (!fullpath || !*fullpath)
+ return;
+
+ cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
+
+ mutex_lock(&dfs_cache.dc_lock);
+ vi = find_vol(fullpath);
+ if (!IS_ERR(vi))
+ free_vol(vi);
+ mutex_unlock(&dfs_cache.dc_lock);
+}
+
+/* Get all tcons that are within a DFS namespace and can be refreshed */
+static void get_tcons(struct TCP_Server_Info *server, struct list_head *head)
+{
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ INIT_LIST_HEAD(head);
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ if (!tcon->need_reconnect && !tcon->need_reopen_files &&
+ tcon->dfs_path) {
+ tcon->tc_count++;
+ list_add_tail(&tcon->ulist, head);
+ }
+ }
+ if (ses->tcon_ipc && !ses->tcon_ipc->need_reconnect &&
+ ses->tcon_ipc->dfs_path) {
+ list_add_tail(&ses->tcon_ipc->ulist, head);
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+}
+
+/* Refresh DFS cache entry from a given tcon */
+static void do_refresh_tcon(struct dfs_cache *dc, struct cifs_tcon *tcon)
+{
+ int rc = 0;
+ unsigned int xid;
+ char *path, *npath;
+ unsigned int h;
+ struct dfs_cache_entry *ce;
+ struct dfs_info3_param *refs = NULL;
+ int numrefs = 0;
+
+ xid = get_xid();
+
+ path = tcon->dfs_path + 1;
+
+ rc = get_normalized_path(path, &npath);
+ if (rc)
+ goto out;
+
+ mutex_lock(&dfs_cache_list_lock);
+ ce = find_cache_entry(npath, &h);
+ mutex_unlock(&dfs_cache_list_lock);
+
+ if (IS_ERR(ce)) {
+ rc = PTR_ERR(ce);
+ goto out;
+ }
+
+ if (!cache_entry_expired(ce))
+ goto out;
+
+ if (unlikely(!tcon->ses->server->ops->get_dfs_refer)) {
+ rc = -EOPNOTSUPP;
+ } else {
+ rc = tcon->ses->server->ops->get_dfs_refer(xid, tcon->ses, path,
+ &refs, &numrefs,
+ dc->dc_nlsc,
+ tcon->remap);
+ if (!rc) {
+ mutex_lock(&dfs_cache_list_lock);
+ ce = __update_cache_entry(npath, refs, numrefs);
+ mutex_unlock(&dfs_cache_list_lock);
+ dump_refs(refs, numrefs);
+ free_dfs_info_array(refs, numrefs);
+ if (IS_ERR(ce))
+ rc = PTR_ERR(ce);
+ }
+ }
+ if (rc)
+ cifs_dbg(FYI, "%s: failed to update expired entry\n", __func__);
+out:
+ free_xid(xid);
+ free_normalized_path(path, npath);
+}
+
+/*
+ * Worker that will refresh DFS cache based on lowest TTL value from a DFS
+ * referral.
+ *
+ * FIXME: ensure that all requests are sent to DFS root for refreshing the
+ * cache.
+ */
+static void refresh_cache_worker(struct work_struct *work)
+{
+ struct dfs_cache *dc = container_of(work, struct dfs_cache,
+ dc_refresh.work);
+ struct dfs_cache_vol_info *vi;
+ struct TCP_Server_Info *server;
+ LIST_HEAD(list);
+ struct cifs_tcon *tcon, *ntcon;
+
+ mutex_lock(&dc->dc_lock);
+
+ list_for_each_entry(vi, &dc->dc_vol_list, vi_list) {
+ server = cifs_find_tcp_session(&vi->vi_vol);
+ if (IS_ERR_OR_NULL(server))
+ continue;
+ if (server->tcpStatus != CifsGood)
+ goto next;
+ get_tcons(server, &list);
+ list_for_each_entry_safe(tcon, ntcon, &list, ulist) {
+ do_refresh_tcon(dc, tcon);
+ list_del_init(&tcon->ulist);
+ cifs_put_tcon(tcon);
+ }
+next:
+ cifs_put_tcp_session(server, 0);
+ }
+ queue_delayed_work(cifsiod_wq, &dc->dc_refresh, dc->dc_ttl * HZ);
+ mutex_unlock(&dc->dc_lock);
+}
diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h
new file mode 100644
index 0000000..22f3665
--- /dev/null
+++ b/fs/cifs/dfs_cache.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * DFS referral cache routines
+ *
+ * Copyright (c) 2018 Paulo Alcantara <palcantara@suse.de>
+ */
+
+#ifndef _CIFS_DFS_CACHE_H
+#define _CIFS_DFS_CACHE_H
+
+#include <linux/nls.h>
+#include <linux/list.h>
+#include "cifsglob.h"
+
+struct dfs_cache_tgt_list {
+ int tl_numtgts;
+ struct list_head tl_list;
+};
+
+struct dfs_cache_tgt_iterator {
+ char *it_name;
+ struct list_head it_list;
+};
+
+extern int dfs_cache_init(void);
+extern void dfs_cache_destroy(void);
+extern const struct file_operations dfscache_proc_fops;
+
+extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,