aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c4
-rw-r--r--fs/9p/vfs_addr.c4
-rw-r--r--fs/9p/vfs_dentry.c8
-rw-r--r--fs/9p/vfs_dir.c4
-rw-r--r--fs/9p/vfs_file.c8
-rw-r--r--fs/9p/vfs_inode.c32
-rw-r--r--fs/9p/vfs_inode_dotl.c8
-rw-r--r--fs/Kconfig6
-rw-r--r--fs/affs/amigaffs.c2
-rw-r--r--fs/affs/file.c39
-rw-r--r--fs/affs/inode.c3
-rw-r--r--fs/affs/super.c6
-rw-r--r--fs/afs/dir.c5
-rw-r--r--fs/afs/vlocation.c1
-rw-r--r--fs/aio.c4
-rw-r--r--fs/autofs4/autofs_i.h6
-rw-r--r--fs/autofs4/dev-ioctl.c2
-rw-r--r--fs/autofs4/expire.c216
-rw-r--r--fs/autofs4/root.c62
-rw-r--r--fs/befs/btree.c53
-rw-r--r--fs/binfmt_aout.c25
-rw-r--r--fs/binfmt_elf.c25
-rw-r--r--fs/binfmt_elf_fdpic.c24
-rw-r--r--fs/binfmt_misc.c23
-rw-r--r--fs/block_dev.c7
-rw-r--r--fs/btrfs/async-thread.c11
-rw-r--r--fs/btrfs/async-thread.h1
-rw-r--r--fs/btrfs/backref.c123
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/btrfs_inode.h27
-rw-r--r--fs/btrfs/check-integrity.c18
-rw-r--r--fs/btrfs/compression.c21
-rw-r--r--fs/btrfs/ctree.c106
-rw-r--r--fs/btrfs/ctree.h93
-rw-r--r--fs/btrfs/delayed-inode.c8
-rw-r--r--fs/btrfs/dev-replace.c82
-rw-r--r--fs/btrfs/dir-item.c12
-rw-r--r--fs/btrfs/disk-io.c292
-rw-r--r--fs/btrfs/disk-io.h16
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c267
-rw-r--r--fs/btrfs/extent_io.c483
-rw-r--r--fs/btrfs/extent_io.h60
-rw-r--r--fs/btrfs/file-item.c30
-rw-r--r--fs/btrfs/file.c151
-rw-r--r--fs/btrfs/free-space-cache.c157
-rw-r--r--fs/btrfs/hash.c4
-rw-r--r--fs/btrfs/inode-item.c12
-rw-r--r--fs/btrfs/inode-map.c68
-rw-r--r--fs/btrfs/inode.c648
-rw-r--r--fs/btrfs/ioctl.c65
-rw-r--r--fs/btrfs/lzo.c3
-rw-r--r--fs/btrfs/orphan.c4
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/qgroup.c30
-rw-r--r--fs/btrfs/raid56.c8
-rw-r--r--fs/btrfs/reada.c2
-rw-r--r--fs/btrfs/relocation.c142
-rw-r--r--fs/btrfs/scrub.c67
-rw-r--r--fs/btrfs/send.c47
-rw-r--r--fs/btrfs/super.c137
-rw-r--r--fs/btrfs/sysfs.c41
-rw-r--r--fs/btrfs/sysfs.h16
-rw-r--r--fs/btrfs/tests/free-space-tests.c516
-rw-r--r--fs/btrfs/transaction.c52
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-log.c262
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/uuid-tree.c1
-rw-r--r--fs/btrfs/volumes.c676
-rw-r--r--fs/btrfs/volumes.h166
-rw-r--r--fs/btrfs/xattr.c4
-rw-r--r--fs/btrfs/zlib.c141
-rw-r--r--fs/buffer.c37
-rw-r--r--fs/cachefiles/rdwr.c48
-rw-r--r--fs/ceph/dir.c1
-rw-r--r--fs/cifs/cifs_dfs_ref.c6
-rw-r--r--fs/cifs/cifs_spnego.c1
-rw-r--r--fs/cifs/cifsacl.c1
-rw-r--r--fs/cifs/cifsfs.c7
-rw-r--r--fs/cifs/connect.c4
-rw-r--r--fs/cifs/dir.c14
-rw-r--r--fs/cifs/file.c12
-rw-r--r--fs/cifs/inode.c12
-rw-r--r--fs/cifs/readdir.c6
-rw-r--r--fs/compat.c28
-rw-r--r--fs/coredump.c8
-rw-r--r--fs/dcache.c259
-rw-r--r--fs/dlm/plock.c8
-rw-r--r--fs/ecryptfs/file.c6
-rw-r--r--fs/ecryptfs/inode.c25
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/messaging.c3
-rw-r--r--fs/exec.c17
-rw-r--r--fs/ext2/super.c6
-rw-r--r--fs/ext3/ext3.h12
-rw-r--r--fs/ext3/super.c20
-rw-r--r--fs/ext4/super.c14
-rw-r--r--fs/f2fs/checkpoint.c97
-rw-r--r--fs/f2fs/data.c69
-rw-r--r--fs/f2fs/debug.c20
-rw-r--r--fs/f2fs/dir.c19
-rw-r--r--fs/f2fs/f2fs.h163
-rw-r--r--fs/f2fs/file.c257
-rw-r--r--fs/f2fs/gc.c26
-rw-r--r--fs/f2fs/inline.c20
-rw-r--r--fs/f2fs/inode.c37
-rw-r--r--fs/f2fs/namei.c53
-rw-r--r--fs/f2fs/node.c460
-rw-r--r--fs/f2fs/node.h60
-rw-r--r--fs/f2fs/recovery.c191
-rw-r--r--fs/f2fs/segment.c520
-rw-r--r--fs/f2fs/segment.h160
-rw-r--r--fs/f2fs/super.c47
-rw-r--r--fs/f2fs/xattr.c8
-rw-r--r--fs/fat/misc.c2
-rw-r--r--fs/fcntl.c21
-rw-r--r--fs/file.c3
-rw-r--r--fs/file_table.c14
-rw-r--r--fs/fuse/dir.c7
-rw-r--r--fs/gfs2/dentry.c3
-rw-r--r--fs/gfs2/dir.c9
-rw-r--r--fs/gfs2/dir.h1
-rw-r--r--fs/gfs2/file.c22
-rw-r--r--fs/gfs2/glock.c4
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/gfs2/inode.c12
-rw-r--r--fs/gfs2/rgrp.c30
-rw-r--r--fs/gfs2/rgrp.h1
-rw-r--r--fs/gfs2/trans.c2
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/internal.h7
-rw-r--r--fs/isofs/inode.c2
-rw-r--r--fs/jffs2/jffs2_fs_sb.h2
-rw-r--r--fs/jffs2/wbuf.c17
-rw-r--r--fs/jfs/jfs_logmgr.c2
-rw-r--r--fs/jfs/jfs_txnmgr.c3
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/kernfs/dir.c11
-rw-r--r--fs/libfs.c18
-rw-r--r--fs/lockd/Makefile3
-rw-r--r--fs/lockd/mon.c6
-rw-r--r--fs/lockd/netns.h1
-rw-r--r--fs/lockd/procfs.c92
-rw-r--r--fs/lockd/procfs.h28
-rw-r--r--fs/lockd/svc.c16
-rw-r--r--fs/lockd/svclock.c68
-rw-r--r--fs/locks.c444
-rw-r--r--fs/mount.h25
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/namei.c29
-rw-r--r--fs/namespace.c203
-rw-r--r--fs/ncpfs/dir.c9
-rw-r--r--fs/ncpfs/ncplib_kernel.h14
-rw-r--r--fs/nfs/blocklayout/Makefile3
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1386
-rw-r--r--fs/nfs/blocklayout/blocklayout.h213
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c384
-rw-r--r--fs/nfs/blocklayout/blocklayoutdm.c108
-rw-r--r--fs/nfs/blocklayout/dev.c363
-rw-r--r--fs/nfs/blocklayout/extent_tree.c602
-rw-r--r--fs/nfs/blocklayout/extents.c908
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c284
-rw-r--r--fs/nfs/callback.c4
-rw-r--r--fs/nfs/callback_proc.c23
-rw-r--r--fs/nfs/client.c6
-rw-r--r--fs/nfs/dir.c7
-rw-r--r--fs/nfs/direct.c14
-rw-r--r--fs/nfs/file.c65
-rw-r--r--fs/nfs/filelayout/filelayout.c34
-rw-r--r--fs/nfs/filelayout/filelayout.h7
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c108
-rw-r--r--fs/nfs/fscache-index.c3
-rw-r--r--fs/nfs/idmap.c2
-rw-r--r--fs/nfs/inode.c4
-rw-r--r--fs/nfs/internal.h8
-rw-r--r--fs/nfs/nfs3_fs.h34
-rw-r--r--fs/nfs/nfs3acl.c1
-rw-r--r--fs/nfs/nfs3client.c1
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs3super.c1
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c138
-rw-r--r--fs/nfs/nfs4renewd.c12
-rw-r--r--fs/nfs/nfs4state.c18
-rw-r--r--fs/nfs/nfs4xdr.c179
-rw-r--r--fs/nfs/objlayout/objio_osd.c113
-rw-r--r--fs/nfs/objlayout/objlayout.c70
-rw-r--r--fs/nfs/objlayout/objlayout.h5
-rw-r--r--fs/nfs/pagelist.c8
-rw-r--r--fs/nfs/pnfs.c105
-rw-r--r--fs/nfs/pnfs.h50
-rw-r--r--fs/nfs/pnfs_dev.c150
-rw-r--r--fs/nfs/super.c11
-rw-r--r--fs/nfs/write.c150
-rw-r--r--fs/nfs_common/Makefile3
-rw-r--r--fs/nfs_common/grace.c (renamed from fs/lockd/grace.c)68
-rw-r--r--fs/nfsd/Kconfig4
-rw-r--r--fs/nfsd/cache.h1
-rw-r--r--fs/nfsd/export.c1
-rw-r--r--fs/nfsd/nfs3proc.c13
-rw-r--r--fs/nfsd/nfs4callback.c144
-rw-r--r--fs/nfsd/nfs4idmap.c20
-rw-r--r--fs/nfsd/nfs4proc.c49
-rw-r--r--fs/nfsd/nfs4recover.c206
-rw-r--r--fs/nfsd/nfs4state.c209
-rw-r--r--fs/nfsd/nfs4xdr.c75
-rw-r--r--fs/nfsd/nfscache.c214
-rw-r--r--fs/nfsd/nfsctl.c45
-rw-r--r--fs/nfsd/nfsd.h2
-rw-r--r--fs/nfsd/nfsfh.c6
-rw-r--r--fs/nfsd/state.h32
-rw-r--r--fs/nfsd/vfs.c37
-rw-r--r--fs/nfsd/xdr4.h14
-rw-r--r--fs/nilfs2/file.c8
-rw-r--r--fs/nilfs2/inode.c13
-rw-r--r--fs/nilfs2/ioctl.c8
-rw-r--r--fs/nilfs2/nilfs.h14
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/nilfs2/super.c6
-rw-r--r--fs/nilfs2/the_nilfs.h22
-rw-r--r--fs/notify/dnotify/dnotify.c8
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/notify/fsnotify.h3
-rw-r--r--fs/notify/group.c2
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c6
-rw-r--r--fs/ntfs/debug.c2
-rw-r--r--fs/ntfs/file.c5
-rw-r--r--fs/ntfs/super.c2
-rw-r--r--fs/ocfs2/aops.c15
-rw-r--r--fs/ocfs2/cluster/heartbeat.c21
-rw-r--r--fs/ocfs2/cluster/heartbeat.h1
-rw-r--r--fs/ocfs2/cluster/masklog.c6
-rw-r--r--fs/ocfs2/cluster/netdebug.c78
-rw-r--r--fs/ocfs2/cluster/tcp.c43
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c39
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c44
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c3
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c7
-rw-r--r--fs/ocfs2/dlmglue.c23
-rw-r--r--fs/ocfs2/file.c49
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/quota.h5
-rw-r--r--fs/ocfs2/quota_global.c4
-rw-r--r--fs/ocfs2/quota_local.c33
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/ocfs2/super.c30
-rw-r--r--fs/omfs/inode.c10
-rw-r--r--fs/omfs/omfs_fs.h1
-rw-r--r--fs/proc/base.c85
-rw-r--r--fs/proc/fd.c2
-rw-r--r--fs/proc/internal.h5
-rw-r--r--fs/proc/kcore.c4
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/proc/task_mmu.c351
-rw-r--r--fs/proc/task_nommu.c88
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/reiserfs/journal.c8
-rw-r--r--fs/reiserfs/reiserfs.h5
-rw-r--r--fs/reiserfs/super.c16
-rw-r--r--fs/reiserfs/xattr.h1
-rw-r--r--fs/stack.c2
-rw-r--r--fs/super.c5
-rw-r--r--fs/timerfd.c3
-rw-r--r--fs/udf/file.c9
-rw-r--r--fs/udf/inode.c14
-rw-r--r--fs/udf/super.c10
-rw-r--r--fs/udf/udfdecl.h13
-rw-r--r--fs/udf/udftime.c2
-rw-r--r--fs/ufs/balloc.c3
-rw-r--r--fs/xattr.c116
-rw-r--r--fs/xfs/kmem.c1
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c4
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c365
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h7
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c1
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c67
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c7
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c49
-rw-r--r--fs/xfs/libxfs/xfs_sb.c7
-rw-r--r--fs/xfs/time.h36
-rw-r--r--fs/xfs/xfs_aops.c23
-rw-r--r--fs/xfs/xfs_bmap_util.c126
-rw-r--r--fs/xfs/xfs_buf.c355
-rw-r--r--fs/xfs/xfs_buf.h15
-rw-r--r--fs/xfs/xfs_buf_item.c10
-rw-r--r--fs/xfs/xfs_file.c178
-rw-r--r--fs/xfs/xfs_fsops.c11
-rw-r--r--fs/xfs/xfs_globals.c4
-rw-r--r--fs/xfs/xfs_icache.c1
-rw-r--r--fs/xfs/xfs_inode.c34
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_ioctl.c28
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_ioctl32.h3
-rw-r--r--fs/xfs/xfs_iomap.c4
-rw-r--r--fs/xfs/xfs_iops.c30
-rw-r--r--fs/xfs/xfs_itable.c3
-rw-r--r--fs/xfs/xfs_linux.h6
-rw-r--r--fs/xfs/xfs_log.c59
-rw-r--r--fs/xfs/xfs_log_cil.c47
-rw-r--r--fs/xfs/xfs_log_recover.c689
-rw-r--r--fs/xfs/xfs_mount.c58
-rw-r--r--fs/xfs/xfs_mru_cache.c3
-rw-r--r--fs/xfs/xfs_qm.c1
-rw-r--r--fs/xfs/xfs_rtalloc.c85
-rw-r--r--fs/xfs/xfs_rtalloc.h4
-rw-r--r--fs/xfs/xfs_super.c39
-rw-r--r--fs/xfs/xfs_symlink.c8
-rw-r--r--fs/xfs/xfs_sysctl.h5
-rw-r--r--fs/xfs/xfs_sysfs.c74
-rw-r--r--fs/xfs/xfs_sysfs.h1
-rw-r--r--fs/xfs/xfs_trace.h3
-rw-r--r--fs/xfs/xfs_trans_buf.c16
-rw-r--r--fs/xfs/xfs_trans_inode.c2
320 files changed, 11302 insertions, 8745 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index d51ec9fafcc8..47db55aee7f2 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -65,8 +65,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
{
struct p9_fid *fid, *ret;
- p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
- dentry->d_name.name, dentry, from_kuid(&init_user_ns, uid),
+ p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p) uid %d any %d\n",
+ dentry, dentry, from_kuid(&init_user_ns, uid),
any);
ret = NULL;
/* we'll recheck under lock if there's anything to look in */
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index cc1cfae726b3..eb14e055ea83 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -266,8 +266,8 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
* Now that we do caching with cache mode enabled, We need
* to support direct IO
*/
- p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n",
- iocb->ki_filp->f_path.dentry->d_name.name,
+ p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%pD) off/no(%lld/%lu) EINVAL\n",
+ iocb->ki_filp,
(long long)pos, iter->nr_segs);
return -EINVAL;
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index b03dd23feda8..a345b2d659cc 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -49,8 +49,8 @@
*/
static int v9fs_cached_dentry_delete(const struct dentry *dentry)
{
- p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
- dentry->d_name.name, dentry);
+ p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
+ dentry, dentry);
/* Don't cache negative dentries */
if (!dentry->d_inode)
@@ -67,8 +67,8 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
static void v9fs_dentry_release(struct dentry *dentry)
{
struct hlist_node *p, *n;
- p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
- dentry->d_name.name, dentry);
+ p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
+ dentry, dentry);
hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
p9_client_clunk(hlist_entry(p, struct p9_fid, dlist));
dentry->d_fsdata = NULL;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0b3bfa303dda..4f1151088ebe 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -116,7 +116,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
int reclen = 0;
struct p9_rdir *rdir;
- p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %pD\n", file);
fid = file->private_data;
buflen = fid->clnt->msize - P9_IOHDRSZ;
@@ -172,7 +172,7 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
struct p9_rdir *rdir;
struct p9_dirent curdirent;
- p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %pD\n", file);
fid = file->private_data;
buflen = fid->clnt->msize - P9_READDIRHDRSZ;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 520c11c2dcca..5594505e6e73 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -301,8 +301,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
struct inode *inode = file_inode(filp);
int ret = -ENOLCK;
- p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
- filp, cmd, fl, filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
+ filp, cmd, fl, filp);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -337,8 +337,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
struct inode *inode = file_inode(filp);
int ret = -ENOLCK;
- p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
- filp, cmd, fl, filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
+ filp, cmd, fl, filp);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7fa4f7a7653d..296482fc77a9 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -648,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
struct p9_fid *dfid, *ofid, *fid;
struct inode *inode;
- p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
err = 0;
ofid = NULL;
@@ -755,7 +755,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
struct p9_fid *fid;
struct v9fs_session_info *v9ses;
- p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
err = 0;
v9ses = v9fs_inode2v9ses(dir);
perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
@@ -791,8 +791,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
struct inode *inode;
char *name;
- p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n",
- dir, dentry->d_name.name, dentry, flags);
+ p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%pd) %p flags: %x\n",
+ dir, dentry, dentry, flags);
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
@@ -1239,7 +1239,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
struct p9_fid *fid;
struct p9_wstat *st;
- p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, " %pd\n", dentry);
retval = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
fid = v9fs_fid_lookup(dentry);
@@ -1262,8 +1262,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
retval = min(strlen(st->extension)+1, (size_t)buflen);
memcpy(buffer, st->extension, retval);
- p9_debug(P9_DEBUG_VFS, "%s -> %s (%.*s)\n",
- dentry->d_name.name, st->extension, buflen, buffer);
+ p9_debug(P9_DEBUG_VFS, "%pd -> %s (%.*s)\n",
+ dentry, st->extension, buflen, buffer);
done:
p9stat_free(st);
@@ -1283,7 +1283,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
int len = 0;
char *link = __getname();
- p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
if (!link)
link = ERR_PTR(-ENOMEM);
@@ -1314,8 +1314,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
{
char *s = nd_get_link(nd);
- p9_debug(P9_DEBUG_VFS, " %s %s\n",
- dentry->d_name.name, IS_ERR(s) ? "<error>" : s);
+ p9_debug(P9_DEBUG_VFS, " %pd %s\n",
+ dentry, IS_ERR(s) ? "<error>" : s);
if (!IS_ERR(s))
__putname(s);
}
@@ -1364,8 +1364,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
static int
v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
{
- p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
- dir->i_ino, dentry->d_name.name, symname);
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n",
+ dir->i_ino, dentry, symname);
return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
}
@@ -1386,8 +1386,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
char *name;
struct p9_fid *oldfid;
- p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
- dir->i_ino, dentry->d_name.name, old_dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n",
+ dir->i_ino, dentry, old_dentry);
oldfid = v9fs_fid_clone(old_dentry);
if (IS_ERR(oldfid))
@@ -1428,8 +1428,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
char *name;
u32 perm;
- p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
- dir->i_ino, dentry->d_name.name, mode,
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+ dir->i_ino, dentry, mode,
MAJOR(rdev), MINOR(rdev));
if (!new_valid_dev(rdev))
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1fa85aae24df..02b64f4e576a 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -393,7 +393,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
- p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
err = 0;
v9ses = v9fs_inode2v9ses(dir);
@@ -767,8 +767,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
struct p9_fid *dfid, *oldfid;
struct v9fs_session_info *v9ses;
- p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
- dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %pd, new_name: %pd\n",
+ dir->i_ino, old_dentry, dentry);
v9ses = v9fs_inode2v9ses(dir);
dir_dentry = dentry->d_parent;
@@ -917,7 +917,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
char *link = __getname();
char *target;
- p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
if (!link) {
link = ERR_PTR(-ENOMEM);
diff --git a/fs/Kconfig b/fs/Kconfig
index 312393f32948..db5dc1598716 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -233,9 +233,13 @@ if NETWORK_FILESYSTEMS
source "fs/nfs/Kconfig"
source "fs/nfsd/Kconfig"
+config GRACE_PERIOD
+ tristate
+
config LOCKD
tristate
depends on FILE_LOCKING
+ select GRACE_PERIOD
config LOCKD_V4
bool
@@ -249,7 +253,7 @@ config NFS_ACL_SUPPORT
config NFS_COMMON
bool
- depends on NFSD || NFS_FS
+ depends on NFSD || NFS_FS || LOCKD
default y
source "net/sunrpc/Kconfig"
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 406b29836b19..abc853968fed 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -10,8 +10,6 @@
#include "affs.h"
-extern struct timezone sys_tz;
-
static char ErrorBuffer[256];
/*
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a7fe57d2cd9a..1ed590aafecf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -584,11 +584,14 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
bh->b_state &= ~(1UL << BH_New);
mark_buffer_dirty_inode(bh, inode);
if (prev_bh) {
- u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
- if (tmp)
- affs_warning(sb, "extent_file_ofs", "next block already set for %d (%d)", bidx, tmp);
+ u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+ if (tmp_next)
+ affs_warning(sb, "extent_file_ofs",
+ "next block already set for %d (%d)",
+ bidx, tmp_next);
AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
- affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp);
+ affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
mark_buffer_dirty_inode(prev_bh, inode);
affs_brelse(prev_bh);
}
@@ -727,11 +730,14 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
AFFS_DATA_HEAD(bh)->next = 0;
bh->b_state &= ~(1UL << BH_New);
if (prev_bh) {
- u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
- if (tmp)
- affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp);
+ u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+ if (tmp_next)
+ affs_warning(sb, "commit_write_ofs",
+ "next block already set for %d (%d)",
+ bidx, tmp_next);
AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
- affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp);
+ affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
mark_buffer_dirty_inode(prev_bh, inode);
}
}
@@ -758,11 +764,14 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
AFFS_DATA_HEAD(bh)->next = 0;
bh->b_state &= ~(1UL << BH_New);
if (prev_bh) {
- u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
- if (tmp)
- affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp);
+ u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+ if (tmp_next)
+ affs_warning(sb, "commit_write_ofs",
+ "next block already set for %d (%d)",
+ bidx, tmp_next);
AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
- affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp);
+ affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
mark_buffer_dirty_inode(prev_bh, inode);
}
} else if (be32_to_cpu(AFFS_DATA_HEAD(bh)->size) < tmp)
@@ -842,12 +851,12 @@ affs_truncate(struct inode *inode)
struct address_space *mapping = inode->i_mapping;
struct page *page;
void *fsdata;
- loff_t size = inode->i_size;
+ loff_t isize = inode->i_size;
int res;
- res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
+ res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, 0, &page, &fsdata);
if (!res)
- res = mapping->a_ops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
+ res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata);
else
inode->i_size = AFFS_I(inode)->mmu_private;
mark_inode_dirty(inode);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index bec2d1a0c91c..e217c511459b 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -14,13 +14,11 @@
#include "affs.h"
extern const struct inode_operations affs_symlink_inode_operations;
-extern struct timezone sys_tz;
struct inode *affs_iget(struct super_block *sb, unsigned long ino)
{
struct affs_sb_info *sbi = AFFS_SB(sb);
struct buffer_head *bh;
- struct affs_head *head;
struct affs_tail *tail;
struct inode *inode;
u32 block;
@@ -49,7 +47,6 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
goto bad_inode;
}
- head = AFFS_HEAD(bh);
tail = AFFS_TAIL(sb, bh);
prot = be32_to_cpu(tail->protect);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 51f1a95bff73..f754ab68a840 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -20,8 +20,6 @@
#include <linux/writeback.h>
#include "affs.h"
-extern struct timezone sys_tz;
-
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
static int affs_remount (struct super_block *sb, int *flags, char *data);
@@ -308,7 +306,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
u32 chksum;
int num_bm;
int i, j;
- s32 key;
kuid_t uid;
kgid_t gid;
int reserved;
@@ -367,7 +364,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
i = j = blocksize;
size = size / (blocksize / 512);
}
- for (blocksize = i, key = 0; blocksize <= j; blocksize <<= 1, size >>= 1) {
+ for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
sbi->s_root_block = root_block;
if (root_block < 0)
sbi->s_root_block = (reserved + size - 1) / 2;
@@ -399,7 +396,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
be32_to_cpu(AFFS_ROOT_TAIL(sb, root_bh)->stype) == ST_ROOT) {
sbi->s_hashsize = blocksize / 4 - 56;
sbi->s_root_block += num_bm;
- key = 1;
goto got_root;
}
affs_brelse(root_bh);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 529300327f45..a1645b88fe8a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -669,7 +669,6 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
out_valid:
dentry->d_fsdata = dir_version;
-out_skip:
dput(parent);
key_put(key);
_leave(" = 1 [valid]");
@@ -682,10 +681,6 @@ not_found:
spin_unlock(&dentry->d_lock);
out_bad:
- /* don't unhash if we have submounts */
- if (check_submounts_and_drop(dentry) != 0)
- goto out_skip;
-
_debug("dropping dentry %s/%s",
parent->d_name.name, dentry->d_name.name);
dput(parent);
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index b6df2e83809f..52976785a32c 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -130,7 +130,6 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
/* second+ BUSY - sleep a little bit */
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(1);
- __set_current_state(TASK_RUNNING);
}
continue;
}
diff --git a/fs/aio.c b/fs/aio.c
index 733750096b71..84a751005f5b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -661,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
INIT_LIST_HEAD(&ctx->active_reqs);
- if (percpu_ref_init(&ctx->users, free_ioctx_users))
+ if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
goto err;
- if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+ if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
goto err;
ctx->cpu = alloc_percpu(struct kioctx_cpu);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 9e359fb20c0a..8e98cf954bab 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -79,6 +79,10 @@ struct autofs_info {
};
#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered
+ * for expiry, so RCU_walk is
+ * not permitted
+ */
#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
struct autofs_wait_queue {
@@ -148,7 +152,7 @@ void autofs4_free_ino(struct autofs_info *);
/* Expiration */
int is_autofs4_dentry(struct dentry *);
-int autofs4_expire_wait(struct dentry *dentry);
+int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
int autofs4_expire_run(struct super_block *, struct vfsmount *,
struct autofs_sb_info *,
struct autofs_packet_expire __user *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 5b570b6efa28..aaf96cb25452 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -450,7 +450,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
ino = autofs4_dentry_ino(path.dentry);
if (ino) {
err = 0;
- autofs4_expire_wait(path.dentry);
+ autofs4_expire_wait(path.dentry, 0);
spin_lock(&sbi->fs_lock);
param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index a7be57e39be7..683a5b9ce22a 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -30,12 +30,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
/* Too young to die */
if (!timeout || time_after(ino->last_used + timeout, now))
return 0;
-
- /* update last_used here :-
- - obviously makes sense if it is in use now
- - less obviously, prevents rapid-fire expire
- attempts if expire fails the first time */
- ino->last_used = now;
}
return 1;
}
@@ -255,12 +249,6 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
struct autofs_info *ino = autofs4_dentry_ino(p);
unsigned int ino_count = atomic_read(&ino->count);
- /*
- * Clean stale dentries below that have not been
- * invalidated after a mount fail during lookup
- */
- d_invalidate(p);
-
/* allow for dget above and top is already dgot */
if (p == top)
ino_count += 2;
@@ -333,10 +321,19 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
if (ino->flags & AUTOFS_INF_PENDING)
goto out;
if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
- ino->flags |= AUTOFS_INF_EXPIRING;
- init_completion(&ino->expire_complete);
+ ino->flags |= AUTOFS_INF_NO_RCU;
spin_unlock(&sbi->fs_lock);
- return root;
+ synchronize_rcu();
+ spin_lock(&sbi->fs_lock);
+ if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+ ino->flags |= AUTOFS_INF_EXPIRING;
+ smp_mb();
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
+ init_completion(&ino->expire_complete);
+ spin_unlock(&sbi->fs_lock);
+ return root;
+ }
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
}
out:
spin_unlock(&sbi->fs_lock);
@@ -345,6 +342,89 @@ out:
return NULL;
}
+/* Check if 'dentry' should expire, or return a nearby
+ * dentry that is suitable.
+ * If returned dentry is different from arg dentry,
+ * then a dget() reference was taken, else not.
+ */
+static struct dentry *should_expire(struct dentry *dentry,
+ struct vfsmount *mnt,
+ unsigned long timeout,
+ int how)
+{
+ int do_now = how & AUTOFS_EXP_IMMEDIATE;
+ int exp_leaves = how & AUTOFS_EXP_LEAVES;
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ unsigned int ino_count;
+
+ /* No point expiring a pending mount */
+ if (ino->flags & AUTOFS_INF_PENDING)
+ return NULL;
+
+ /*
+ * Case 1: (i) indirect mount or top level pseudo direct mount
+ * (autofs-4.1).
+ * (ii) indirect mount with offset mount, check the "/"
+ * offset (autofs-5.0+).
+ */
+ if (d_mountpoint(dentry)) {
+ DPRINTK("checking mountpoint %p %.*s",
+ dentry, (int)dentry->d_name.len, dentry->d_name.name);
+
+ /* Can we umount this guy */
+ if (autofs4_mount_busy(mnt, dentry))
+ return NULL;
+
+ /* Can we expire this guy */
+ if (autofs4_can_expire(dentry, timeout, do_now))
+ return dentry;
+ return NULL;
+ }
+
+ if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+ DPRINTK("checking symlink %p %.*s",
+ dentry, (int)dentry->d_name.len, dentry->d_name.name);
+ /*
+ * A symlink can't be "busy" in the usual sense so
+ * just check last used for expire timeout.
+ */
+ if (autofs4_can_expire(dentry, timeout, do_now))
+ return dentry;
+ return NULL;
+ }
+
+ if (simple_empty(dentry))
+ return NULL;
+
+ /* Case 2: tree mount, expire iff entire tree is not busy */
+ if (!exp_leaves) {
+ /* Path walk currently on this dentry? */
+ ino_count = atomic_read(&ino->count) + 1;
+ if (d_count(dentry) > ino_count)
+ return NULL;
+
+ if (!autofs4_tree_busy(mnt, dentry, timeout, do_now))
+ return dentry;
+ /*
+ * Case 3: pseudo direct mount, expire individual leaves
+ * (autofs-4.1).
+ */
+ } else {
+ /* Path walk currently on this dentry? */
+ struct dentry *expired;
+ ino_count = atomic_read(&ino->count) + 1;
+ if (d_count(dentry) > ino_count)
+ return NULL;
+
+ expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
+ if (expired) {
+ if (expired == dentry)
+ dput(dentry);
+ return expired;
+ }
+ }
+ return NULL;
+}
/*
* Find an eligible tree to time-out
* A tree is eligible if :-
@@ -359,11 +439,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
unsigned long timeout;
struct dentry *root = sb->s_root;
struct dentry *dentry;
- struct dentry *expired = NULL;
- int do_now = how & AUTOFS_EXP_IMMEDIATE;
- int exp_leaves = how & AUTOFS_EXP_LEAVES;
+ struct dentry *expired;
struct autofs_info *ino;
- unsigned int ino_count;
if (!root)
return NULL;
@@ -375,77 +452,28 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
while ((dentry = get_next_positive_subdir(dentry, root))) {
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
- /* No point expiring a pending mount */
- if (ino->flags & AUTOFS_INF_PENDING)
- goto next;
-
- /*
- * Case 1: (i) indirect mount or top level pseudo direct mount
- * (autofs-4.1).
- * (ii) indirect mount with offset mount, check the "/"
- * offset (autofs-5.0+).
- */
- if (d_mountpoint(dentry)) {
- DPRINTK("checking mountpoint %p %.*s",
- dentry, (int)dentry->d_name.len, dentry->d_name.name);
-
- /* Can we umount this guy */
- if (autofs4_mount_busy(mnt, dentry))
- goto next;
-
- /* Can we expire this guy */
- if (autofs4_can_expire(dentry, timeout, do_now)) {
- expired = dentry;
- goto found;
- }
- goto next;
- }
-
- if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
- DPRINTK("checking symlink %p %.*s",
- dentry, (int)dentry->d_name.len, dentry->d_name.name);
- /*
- * A symlink can't be "busy" in the usual sense so
- * just check last used for expire timeout.
- */
- if (autofs4_can_expire(dentry, timeout, do_now)) {
- expired = dentry;
- goto found;
- }
- goto next;
+ if (ino->flags & AUTOFS_INF_NO_RCU)
+ expired = NULL;
+ else
+ expired = should_expire(dentry, mnt, timeout, how);
+ if (!expired) {
+ spin_unlock(&sbi->fs_lock);
+ continue;
}
-
- if (simple_empty(dentry))
- goto next;
-
- /* Case 2: tree mount, expire iff entire tree is not busy */
- if (!exp_leaves) {
- /* Path walk currently on this dentry? */
- ino_count = atomic_read(&ino->count) + 1;
- if (d_count(dentry) > ino_count)
- goto next;
-
- if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
- expired = dentry;
- goto found;
- }
- /*
- * Case 3: pseudo direct mount, expire individual leaves
- * (autofs-4.1).
- */
- } else {
- /* Path walk currently on this dentry? */
- ino_count = atomic_read(&ino->count) + 1;
- if (d_count(dentry) > ino_count)
- goto next;
-
- expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
- if (expired) {
+ ino = autofs4_dentry_ino(expired);
+ ino->flags |= AUTOFS_INF_NO_RCU;
+ spin_unlock(&sbi->fs_lock);
+ synchronize_rcu();
+ spin_lock(&sbi->fs_lock);
+ if (should_expire(expired, mnt, timeout, how)) {
+ if (expired != dentry)
dput(dentry);
- goto found;
- }
+ goto found;
}
-next:
+
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
+ if (expired != dentry)
+ dput(expired);
spin_unlock(&sbi->fs_lock);
}
return NULL;
@@ -453,8 +481,9 @@ next:
found:
DPRINTK("returning %p %.*s",
expired, (int)expired->d_name.len, expired->d_name.name);
- ino = autofs4_dentry_ino(expired);
ino->flags |= AUTOFS_INF_EXPIRING;
+ smp_mb();
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
init_completion(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
spin_lock(&sbi->lookup_lock);
@@ -467,13 +496,18 @@ found:
return expired;
}
-int autofs4_expire_wait(struct dentry *dentry)
+int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
/* Block on any pending expire */
+ if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)))
+ return 0;
+ if (rcu_walk)
+ return -ECHILD;
+
spin_lock(&sbi->fs_lock);
if (ino->flags & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
@@ -525,6 +559,8 @@ int autofs4_expire_run(struct super_block *sb,
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
+ /* avoid rapid-fire expire attempts if expiry fails */
+ ino->last_used = now;
ino->flags &= ~AUTOFS_INF_EXPIRING;
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
@@ -551,6 +587,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
spin_lock(&sbi->fs_lock);
+ /* avoid rapid-fire expire attempts if expiry fails */
+ ino->last_used = now;
ino->flags &= ~AUTOFS_INF_EXPIRING;
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index cdb25ebccc4c..d76d083f2f06 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -210,7 +210,8 @@ next:
return NULL;
}
-static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
+static struct dentry *autofs4_lookup_expiring(struct dentry *dentry,
+ bool rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct dentry *parent = dentry->d_parent;
@@ -229,6 +230,11 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
struct dentry *expiring;
struct qstr *qstr;
+ if (rcu_walk) {
+ spin_unlock(&sbi->lookup_lock);
+ return ERR_PTR(-ECHILD);
+ }
+
ino = list_entry(p, struct autofs_info, expiring);
expiring = ino->dentry;
@@ -264,13 +270,15 @@ next:
return NULL;
}
-static int autofs4_mount_wait(struct dentry *dentry)
+static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status = 0;
if (ino->flags & AUTOFS_INF_PENDING) {
+ if (rcu_walk)
+ return -ECHILD;
DPRINTK("waiting for mount name=%.*s",
dentry->d_name.len, dentry->d_name.name);
status = autofs4_wait(sbi, dentry, NFY_MOUNT);
@@ -280,20 +288,22 @@ static int autofs4_mount_wait(struct dentry *dentry)
return status;
}
-static int do_expire_wait(struct dentry *dentry)
+static int do_expire_wait(struct dentry *dentry, bool rcu_walk)
{
struct dentry *expiring;
- expiring = autofs4_lookup_expiring(dentry);
+ expiring = autofs4_lookup_expiring(dentry, rcu_walk);
+ if (IS_ERR(expiring))
+ return PTR_ERR(expiring);
if (!expiring)
- return autofs4_expire_wait(dentry);
+ return autofs4_expire_wait(dentry, rcu_walk);
else {
/*
* If we are racing with expire the request might not
* be quite complete, but the directory has been removed
* so it must have been successful, just wait for it.
*/
- autofs4_expire_wait(expiring);
+ autofs4_expire_wait(expiring, 0);
autofs4_del_expiring(expiring);
dput(expiring);
}
@@ -345,7 +355,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
* and the directory was removed, so just go ahead and try
* the mount.
*/
- status = do_expire_wait(dentry);
+ status = do_expire_wait(dentry, 0);
if (status && status != -EAGAIN)
return NULL;
@@ -353,7 +363,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
spin_lock(&sbi->fs_lock);
if (ino->flags & AUTOFS_INF_PENDING) {
spin_unlock(&sbi->fs_lock);
- status = autofs4_mount_wait(dentry);
+ status = autofs4_mount_wait(dentry, 0);
if (status)
return ERR_PTR(status);
goto done;
@@ -394,7 +404,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
}
ino->flags |= AUTOFS_INF_PENDING;
spin_unlock(&sbi->fs_lock);
- status = autofs4_mount_wait(dentry);
+ status = autofs4_mount_wait(dentry, 0);
spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_PENDING;
if (status) {
@@ -423,28 +433,46 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
/* The daemon never waits. */
if (autofs4_oz_mode(sbi)) {
- if (rcu_walk)
- return 0;
if (!d_mountpoint(dentry))
return -EISDIR;
return 0;
}
- /* We need to sleep, so we need pathwalk to be in ref-mode */
- if (rcu_walk)
- return -ECHILD;
-
/* Wait for pending expires */
- do_expire_wait(dentry);
+ if (do_expire_wait(dentry, rcu_walk) == -ECHILD)
+ return -ECHILD;
/*
* This dentry may be under construction so wait on mount
* completion.
*/
- status = autofs4_mount_wait(dentry);
+ status = autofs4_mount_wait(dentry, rcu_walk);
if (status)
return status;
+ if (rcu_walk) {
+ /* We don't need fs_lock in rcu_walk mode,
+ * just testing 'AUTOFS_INFO_NO_RCU' is enough.
+ * simple_empty() takes a spinlock, so leave it
+ * to last.
+ * We only return -EISDIR when certain this isn't
+ * a mount-trap.
+ */
+ struct inode *inode;
+ if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
+ return 0;
+ if (d_mountpoint(dentry))
+ return 0;
+ inode = ACCESS_ONCE(dentry->d_inode);
+ if (inode && S_ISLNK(inode->i_mode))
+ return -EISDIR;
+ if (list_empty(&dentry->d_subdirs))
+ return 0;
+ if (!simple_empty(dentry))
+ return -EISDIR;
+ return 0;
+ }
+
spin_lock(&sbi->fs_lock);
/*
* If the dentry has been selected for expire while we slept
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 9c7faa8a9288..0826e91dacda 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -78,11 +78,11 @@
/*
* In memory structure of each btree node
*/
-typedef struct {
+struct befs_btree_node {
befs_host_btree_nodehead head; /* head of node converted to cpu byteorder */
struct buffer_head *bh;
befs_btree_nodehead *od_node; /* on disk node */
-} befs_btree_node;
+};
/* local constants */
static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL;
@@ -90,27 +90,30 @@ static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL;
/* local functions */
static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
befs_btree_super * bt_super,
- befs_btree_node * this_node,
+ struct befs_btree_node *this_node,
befs_off_t * node_off);
static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
befs_btree_super * sup);
static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
- befs_btree_node * node, befs_off_t node_off);
+ struct befs_btree_node *node,
+ befs_off_t node_off);
-static int befs_leafnode(befs_btree_node * node);
+static int befs_leafnode(struct befs_btree_node *node);
-static fs16 *befs_bt_keylen_index(befs_btree_node * node);
+static fs16 *befs_bt_keylen_index(struct befs_btree_node *node);
-static fs64 *befs_bt_valarray(befs_btree_node * node);
+static fs64 *befs_bt_valarray(struct befs_btree_node *node);
-static char *befs_bt_keydata(befs_btree_node * node);
+static char *befs_bt_keydata(struct befs_btree_node *node);
-static int befs_find_key(struct super_block *sb, befs_btree_node * node,
+static int befs_find_key(struct super_block *sb,
+ struct befs_btree_node *node,
const char *findkey, befs_off_t * value);
-static char *befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
+static char *befs_bt_get_key(struct super_block *sb,
+ struct befs_btree_node *node,
int index, u16 * keylen);
static int befs_compare_strings(const void *key1, int keylen1,
@@ -191,7 +194,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
static int
befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
- befs_btree_node * node, befs_off_t node_off)
+ struct befs_btree_node *node, befs_off_t node_off)
{
uint off = 0;
@@ -247,7 +250,7 @@ int
befs_btree_find(struct super_block *sb, befs_data_stream * ds,
const char *key, befs_off_t * value)
{
- befs_btree_node *this_node = NULL;
+ struct befs_btree_node *this_node = NULL;
befs_btree_super bt_super;
befs_off_t node_off;
int res;
@@ -260,11 +263,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
goto error;
}
- this_node = kmalloc(sizeof (befs_btree_node),
+ this_node = kmalloc(sizeof(struct befs_btree_node),
GFP_NOFS);
if (!this_node) {
befs_error(sb, "befs_btree_find() failed to allocate %zu "
- "bytes of memory", sizeof (befs_btree_node));
+ "bytes of memory", sizeof(struct befs_btree_node));
goto error;
}
@@ -333,7 +336,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
* Use binary search instead of a linear.
*/
static int
-befs_find_key(struct super_block *sb, befs_btree_node * node,
+befs_find_key(struct super_block *sb, struct befs_btree_node *node,
const char *findkey, befs_off_t * value)
{
int first, last, mid;
@@ -417,7 +420,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize,
befs_off_t * value)
{
- befs_btree_node *this_node;
+ struct befs_btree_node *this_node;
befs_btree_super bt_super;
befs_off_t node_off = 0;
int cur_key;
@@ -436,9 +439,10 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
goto error;
}
- if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
+ this_node = kmalloc(sizeof(struct befs_btree_node), GFP_NOFS);
+ if (this_node == NULL) {
befs_error(sb, "befs_btree_read() failed to allocate %zu "
- "bytes of memory", sizeof (befs_btree_node));
+ "bytes of memory", sizeof(struct befs_btree_node));
goto error;
}
@@ -545,7 +549,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
*/
static int
befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
- befs_btree_super * bt_super, befs_btree_node * this_node,
+ befs_btree_super *bt_super,
+ struct befs_btree_node *this_node,
befs_off_t * node_off)
{
@@ -600,7 +605,7 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
* Return 1 if leaf, 0 if interior
*/
static int
-befs_leafnode(befs_btree_node * node)
+befs_leafnode(struct befs_btree_node *node)
{
/* all interior nodes (and only interior nodes) have an overflow node */
if (node->head.overflow == befs_bt_inval)
@@ -623,7 +628,7 @@ befs_leafnode(befs_btree_node * node)
* Except that rounding up to 8 works, and rounding up to 4 doesn't.
*/
static fs16 *
-befs_bt_keylen_index(befs_btree_node * node)
+befs_bt_keylen_index(struct befs_btree_node *node)
{
const int keylen_align = 8;
unsigned long int off =
@@ -644,7 +649,7 @@ befs_bt_keylen_index(befs_btree_node * node)
* of the node pointed to by the node header
*/
static fs64 *
-befs_bt_valarray(befs_btree_node * node)
+befs_bt_valarray(struct befs_btree_node *node)
{
void *keylen_index_start = (void *) befs_bt_keylen_index(node);
size_t keylen_index_size = node->head.all_key_count * sizeof (fs16);
@@ -660,7 +665,7 @@ befs_bt_valarray(befs_btree_node * node)
* of the node pointed to by the node header
*/
static char *
-befs_bt_keydata(befs_btree_node * node)
+befs_bt_keydata(struct befs_btree_node *node)
{
return (char *) ((void *) node->od_node + sizeof (befs_btree_nodehead));
}
@@ -676,7 +681,7 @@ befs_bt_keydata(befs_btree_node * node)
* Returns NULL on failure (bad input) and sets *@keylen = 0
*/
static char *
-befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
+befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node,
int index, u16 * keylen)
{
int prev_key_end;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ca0ba15a7306..929dec08c348 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -256,11 +256,8 @@ static int load_aout_binary(struct linux_binprm * bprm)
(current->mm->start_brk = N_BSSADDR(ex));
retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0) {
- /* Someone check-me: is this error path enough? */
- send_sig(SIGKILL, current, 0);
+ if (retval < 0)
return retval;
- }
install_exec_creds(bprm);
@@ -278,17 +275,13 @@ static int load_aout_binary(struct linux_binprm * bprm)
map_size = ex.a_text+ex.a_data;
#endif
error = vm_brk(text_addr & PAGE_MASK, map_size);
- if (error != (text_addr & PAGE_MASK)) {
- send_sig(SIGKILL, current, 0);
+ if (error != (text_addr & PAGE_MASK))
return error;
- }
error = read_code(bprm->file, text_addr, pos,
ex.a_text+ex.a_data);
- if ((signed long)error < 0) {
- send_sig(SIGKILL, current, 0);
+ if ((signed long)error < 0)
return error;
- }
} else {
if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
(N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
@@ -315,28 +308,22 @@ static int load_aout_binary(struct linux_binprm * bprm)
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
fd_offset);
- if (error != N_TXTADDR(ex)) {
- send_sig(SIGKILL, current, 0);
+ if (error != N_TXTADDR(ex))
return error;
- }
error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
fd_offset + ex.a_text);
- if (error != N_DATADDR(ex)) {
- send_sig(SIGKILL, current, 0);
+ if (error != N_DATADDR(ex))
return error;
- }
}
beyond_if:
set_binfmt(&aout_format);
retval = set_brk(current->mm->start_brk, current->mm->brk);
- if (retval < 0) {
- send_sig(SIGKILL, current, 0);
+ if (retval < 0)
return retval;
- }
current->mm->start_stack =
(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3892c1a23241..d8fc0605b9d2 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -738,10 +738,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
change some of these later */
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
- if (retval < 0) {
- send_sig(SIGKILL, current, 0);
+ if (retval < 0)
goto out_free_dentry;
- }
current->mm->start_stack = bprm->p;
@@ -763,10 +761,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
and clear the area. */
retval = set_brk(elf_bss + load_bias,
elf_brk + load_bias);
- if (retval) {
- send_sig(SIGKILL, current, 0);
+ if (retval)
goto out_free_dentry;
- }
nbyte = ELF_PAGEOFFSET(elf_bss);
if (nbyte) {
nbyte = ELF_MIN_ALIGN - nbyte;
@@ -820,7 +816,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, 0);
if (BAD_ADDR(error)) {
- send_sig(SIGKILL, current, 0);
retval = IS_ERR((void *)error) ?
PTR_ERR((void*)error) : -EINVAL;
goto out_free_dentry;
@@ -851,7 +846,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
elf_ppnt->p_memsz > TASK_SIZE ||
TASK_SIZE - elf_ppnt->p_memsz < k) {
/* set_brk can never work. Avoid overflows. */
- send_sig(SIGKILL, current, 0);
retval = -EINVAL;
goto out_free_dentry;
}
@@ -883,12 +877,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
* up getting placed where the bss needs to go.
*/
retval = set_brk(elf_bss, elf_brk);
- if (retval) {
- send_sig(SIGKILL, current, 0);
+ if (retval)
goto out_free_dentry;
- }
if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
- send_sig(SIGSEGV, current, 0);
retval = -EFAULT; /* Nobody gets to see this, but.. */
goto out_free_dentry;
}
@@ -909,7 +900,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
elf_entry += loc->interp_elf_ex.e_entry;
}
if (BAD_ADDR(elf_entry)) {
- force_sig(SIGSEGV, current);
retval = IS_ERR((void *)elf_entry) ?
(int)elf_entry : -EINVAL;
goto out_free_dentry;
@@ -922,7 +912,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
} else {
elf_entry = loc->elf_ex.e_entry;
if (BAD_ADDR(elf_entry)) {
- force_sig(SIGSEGV, current);
retval = -EINVAL;
goto out_free_dentry;
}
@@ -934,19 +923,15 @@ static int load_elf_binary(struct linux_binprm *bprm)
#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
- if (retval < 0) {
- send_sig(SIGKILL, current, 0);
+ if (retval < 0)
goto out;
- }
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
install_exec_creds(bprm);
retval = create_elf_tables(bprm, &loc->elf_ex,
load_addr, interp_load_addr);
- if (retval < 0) {
- send_sig(SIGKILL, current, 0);
+ if (retval < 0)
goto out;
- }
/* N.B. passed_fileno might not be initialized? */
current->mm->end_code = end_code;
current->mm->start_code = start_code;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index fe2a643ee005..d3634bfb7fe1 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -317,8 +317,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
goto error;
/* there's now no turning back... the old userspace image is dead,
- * defunct, deceased, etc. after this point we have to exit via
- * error_kill */
+ * defunct, deceased, etc.
+ */
set_personality(PER_LINUX_FDPIC);
if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
@@ -343,24 +343,22 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
retval = setup_arg_pages(bprm, current->mm->start_stack,
executable_stack);
- if (retval < 0) {
- send_sig(SIGKILL, current, 0);
- goto error_kill;
- }
+ if (retval < 0)
+ goto error;
#endif
/* load the executable and interpreter into memory */
retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm,
"executable");
if (retval < 0)
- goto error_kill;
+ goto error;
if (interpreter_name) {
retval = elf_fdpic_map_file(&interp_params, interpreter,
current->mm, "interpreter");
if (retval < 0) {
printk(KERN_ERR "Unable to load interpreter\n");
- goto error_kill;
+ goto error;
}
allow_write_access(interpreter);
@@ -397,7 +395,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
if (IS_ERR_VALUE(current->mm->start_brk)) {
retval = current->mm->start_brk;
current->mm->start_brk = 0;
- goto error_kill;
+ goto error;
}
current->mm->brk = current->mm->start_brk;
@@ -410,7 +408,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
install_exec_creds(bprm);
if (create_elf_fdpic_tables(bprm, current->mm,
&exec_params, &interp_params) < 0)
- goto error_kill;
+ goto error;
kdebug("- start_code %lx", current->mm->start_code);
kdebug("- end_code %lx", current->mm->end_code);
@@ -449,12 +447,6 @@ error:
kfree(interp_params.phdrs);
kfree(interp_params.loadmap);
return retval;
-
- /* unrecoverable error - kill the process */
-error_kill:
- send_sig(SIGSEGV, current, 0);
- goto error;
-
}
/*****************************************************************************/
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index b60500300dd7..fd8beb9657a2 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -62,7 +62,22 @@ static struct file_system_type bm_fs_type;
static struct vfsmount *bm_mnt;
static int entry_count;
-/*
+/*
+ * Max length of the register string. Determined by:
+ * - 7 delimiters
+ * - name: ~50 bytes
+ * - type: 1 byte
+ * - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE)
+ * - magic: 128 bytes (512 in escaped form)
+ * - mask: 128 bytes (512 in escaped form)
+ * - interp: ~50 bytes
+ * - flags: 5 bytes
+ * Round that up a bit, and then back off to hold the internal data
+ * (like struct Node).
+ */
+#define MAX_REGISTER_LENGTH 1920
+
+/*
* Check if we support the binfmt
* if we do, return the node, else NULL
* locking is done in load_misc_binary
@@ -279,7 +294,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
/* some sanity checks */
err = -EINVAL;
- if ((count < 11) || (count > 256))
+ if ((count < 11) || (count > MAX_REGISTER_LENGTH))
goto out;
err = -ENOMEM;
@@ -396,12 +411,12 @@ static int parse_command(const char __user *buffer, size_t count)
{
char s[4];
- if (!count)
- return 0;
if (count > 3)
return -EINVAL;
if (copy_from_user(s, buffer, count))
return -EFAULT;
+ if (!count)
+ return 0;
if (s[count-1] == '\n')
count--;
if (count == 1 && s[0] == '0')
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6d7274619bf9..e2f3ad0879ce 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -304,6 +304,12 @@ static int blkdev_readpage(struct file * file, struct page * page)
return block_read_full_page(page, blkdev_get_block);
}
+static int blkdev_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+}
+
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1622,6 +1628,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
+ .readpages = blkdev_readpages,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index fbd76ded9a34..4dabeb893b7c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -74,6 +74,7 @@ BTRFS_WORK_HELPER(endio_helper);
BTRFS_WORK_HELPER(endio_meta_helper);
BTRFS_WORK_HELPER(endio_meta_write_helper);
BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(endio_repair_helper);
BTRFS_WORK_HELPER(rmw_helper);
BTRFS_WORK_HELPER(endio_write_helper);
BTRFS_WORK_HELPER(freespace_write_helper);
@@ -91,7 +92,7 @@ __btrfs_alloc_workqueue(const char *name, int flags, int max_active,
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
- if (unlikely(!ret))
+ if (!ret)
return NULL;
ret->max_active = max_active;
@@ -115,7 +116,7 @@ __btrfs_alloc_workqueue(const char *name, int flags, int max_active,
ret->normal_wq = alloc_workqueue("%s-%s", flags,
ret->max_active, "btrfs",
name);
- if (unlikely(!ret->normal_wq)) {
+ if (!ret->normal_wq) {
kfree(ret);
return NULL;
}
@@ -137,12 +138,12 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
{
struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
- if (unlikely(!ret))
+ if (!ret)
return NULL;
ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
max_active, thresh);
- if (unlikely(!ret->normal)) {
+ if (!ret->normal) {
kfree(ret);
return NULL;
}
@@ -150,7 +151,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
if (flags & WQ_HIGHPRI) {
ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
thresh);
- if (unlikely(!ret->high)) {
+ if (!ret->high) {
__btrfs_destroy_workqueue(ret->normal);
kfree(ret);
return NULL;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index e9e31c94758f..e386c29ef1f6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -53,6 +53,7 @@ BTRFS_WORK_HELPER_PROTO(endio_helper);
BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
BTRFS_WORK_HELPER_PROTO(rmw_helper);
BTRFS_WORK_HELPER_PROTO(endio_write_helper);
BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 54a201dac7f9..2d3e32ebfd15 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -25,6 +25,9 @@
#include "delayed-ref.h"
#include "locking.h"
+/* Just an arbitrary number so we can be sure this happened */
+#define BACKREF_FOUND_SHARED 6
+
struct extent_inode_elem {
u64 inum;
u64 offset;
@@ -377,7 +380,8 @@ out:
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct list_head *head,
- const u64 *extent_item_pos, u64 total_refs)
+ const u64 *extent_item_pos, u64 total_refs,
+ u64 root_objectid)
{
int err;
int ret = 0;
@@ -402,6 +406,10 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
continue;
if (ref->count == 0)
continue;
+ if (root_objectid && ref->root_id != root_objectid) {
+ ret = BACKREF_FOUND_SHARED;
+ goto out;
+ }
err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
parents, extent_item_pos,
total_refs);
@@ -482,7 +490,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
continue;
BUG_ON(!ref->wanted_disk_byte);
eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
- fs_info->tree_root->leafsize, 0);
+ 0);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
@@ -561,7 +569,8 @@ static void __merge_refs(struct list_head *head, int mode)
* smaller or equal that seq to the list
*/
static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
- struct list_head *prefs, u64 *total_refs)
+ struct list_head *prefs, u64 *total_refs,
+ u64 inum)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct rb_node *n = &head->node.rb_node;
@@ -625,6 +634,16 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
key.objectid = ref->objectid;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = ref->offset;
+
+ /*
+ * Found a inum that doesn't match our known inum, we
+ * know it's shared.
+ */
+ if (inum && ref->objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
node->bytenr,
node->ref_mod * sgn, GFP_ATOMIC);
@@ -659,7 +678,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
static int __add_inline_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
int *info_level, struct list_head *prefs,
- u64 *total_refs)
+ u64 *total_refs, u64 inum)
{
int ret = 0;
int slot;
@@ -744,6 +763,12 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
dref);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+ if (inum && key.objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
root = btrfs_extent_data_ref_root(leaf, dref);
ret = __add_prelim_ref(prefs, root, &key, 0, 0,
bytenr, count, GFP_NOFS);
@@ -765,7 +790,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
*/
static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
- int info_level, struct list_head *prefs)
+ int info_level, struct list_head *prefs, u64 inum)
{
struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
@@ -827,6 +852,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
dref);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+ if (inum && key.objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
root = btrfs_extent_data_ref_root(leaf, dref);
ret = __add_prelim_ref(prefs, root, &key, 0, 0,
bytenr, count, GFP_NOFS);
@@ -854,7 +885,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist *refs,
- struct ulist *roots, const u64 *extent_item_pos)
+ struct ulist *roots, const u64 *extent_item_pos,
+ u64 root_objectid, u64 inum)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -929,7 +961,8 @@ again:
}
spin_unlock(&delayed_refs->lock);
ret = __add_delayed_refs(head, time_seq,
- &prefs_delayed, &total_refs);
+ &prefs_delayed, &total_refs,
+ inum);
mutex_unlock(&head->mutex);
if (ret)
goto out;
@@ -951,11 +984,11 @@ again:
key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = __add_inline_refs(fs_info, path, bytenr,
&info_level, &prefs,
- &total_refs);
+ &total_refs, inum);
if (ret)
goto out;
ret = __add_keyed_refs(fs_info, path, bytenr,
- info_level, &prefs);
+ info_level, &prefs, inum);
if (ret)
goto out;
}
@@ -971,7 +1004,8 @@ again:
__merge_refs(&prefs, 1);
ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
- extent_item_pos, total_refs);
+ extent_item_pos, total_refs,
+ root_objectid);
if (ret)
goto out;
@@ -981,6 +1015,11 @@ again:
ref = list_first_entry(&prefs, struct __prelim_ref, list);
WARN_ON(ref->count < 0);
if (roots && ref->count && ref->root_id && ref->parent == 0) {
+ if (root_objectid && ref->root_id != root_objectid) {
+ ret = BACKREF_FOUND_SHARED;
+ goto out;
+ }
+
/* no parent == root of tree */
ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
if (ret < 0)
@@ -989,12 +1028,10 @@ again:
if (ref->count && ref->parent) {
if (extent_item_pos && !ref->inode_list &&
ref->level == 0) {
- u32 bsz;
struct extent_buffer *eb;
- bsz = btrfs_level_size(fs_info->extent_root,
- ref->level);
+
eb = read_tree_block(fs_info->extent_root,
- ref->parent, bsz, 0);
+ ref->parent, 0);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
@@ -1087,7 +1124,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = find_parent_nodes(trans, fs_info, bytenr,
- time_seq, *leafs, NULL, extent_item_pos);
+ time_seq, *leafs, NULL, extent_item_pos, 0, 0);
if (ret < 0 && ret != -ENOENT) {
free_leaf_list(*leafs);
return ret;
@@ -1130,7 +1167,7 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
ULIST_ITER_INIT(&uiter);
while (1) {
ret = find_parent_nodes(trans, fs_info, bytenr,
- time_seq, tmp, *roots, NULL);
+ time_seq, tmp, *roots, NULL, 0, 0);
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
@@ -1161,6 +1198,54 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return ret;
}
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 root_objectid,
+ u64 inum, u64 bytenr)
+{
+ struct ulist *tmp = NULL;
+ struct ulist *roots = NULL;
+ struct ulist_iterator uiter;
+ struct ulist_node *node;
+ struct seq_list elem = {};
+ int ret = 0;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ roots = ulist_alloc(GFP_NOFS);
+ if (!tmp || !roots) {
+ ulist_free(tmp);
+ ulist_free(roots);
+ return -ENOMEM;
+ }
+
+ if (trans)
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ else
+ down_read(&fs_info->commit_root_sem);
+ ULIST_ITER_INIT(&uiter);
+ while (1) {
+ ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
+ roots, NULL, root_objectid, inum);
+ if (ret == BACKREF_FOUND_SHARED) {
+ ret = 1;
+ break;
+ }
+ if (ret < 0 && ret != -ENOENT)
+ break;
+ node = ulist_next(tmp, &uiter);
+ if (!node)
+ break;
+ bytenr = node->val;
+ cond_resched();
+ }
+ if (trans)
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ else
+ up_read(&fs_info->commit_root_sem);
+ ulist_free(tmp);
+ ulist_free(roots);
+ return ret;
+}
+
/*
* this makes the path point to (inum INODE_ITEM ioff)
*/
@@ -1193,7 +1278,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
unsigned long ptr;
key.objectid = inode_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = start_off;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1233,7 +1318,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
ret = -ENOENT;
if (found_key.objectid != inode_objectid)
break;
- if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+ if (found_key.type != BTRFS_INODE_EXTREF_KEY)
break;
ret = 0;
@@ -1366,7 +1451,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type == BTRFS_METADATA_ITEM_KEY)
- size = fs_info->extent_root->leafsize;
+ size = fs_info->extent_root->nodesize;
else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
size = found_key->offset;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 86fc20fec282..2a1ac6bfc724 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -71,6 +71,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 root_objectid,
+ u64 inum, u64 bytenr);
int __init btrfs_prelim_ref_init(void);
void btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 56b8522d5767..4aadadcfab20 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,17 @@
#define BTRFS_INODE_IN_DELALLOC_LIST 9
#define BTRFS_INODE_READDIO_NEED_LOCK 10
#define BTRFS_INODE_HAS_PROPS 11
+/*
+ * The following 3 bits are meant only for the btree inode.
+ * When any of them is set, it means an error happened while writing an
+ * extent buffer belonging to:
+ * 1) a non-log btree
+ * 2) a log btree and first log sub-transaction
+ * 3) a log btree and second log sub-transaction
+ */
+#define BTRFS_INODE_BTREE_ERR 12
+#define BTRFS_INODE_BTREE_LOG1_ERR 13
+#define BTRFS_INODE_BTREE_LOG2_ERR 14
/* in memory btrfs inode */
struct btrfs_inode {
@@ -121,6 +132,12 @@ struct btrfs_inode {
u64 delalloc_bytes;
/*
+ * total number of bytes pending defrag, used by stat to check whether
+ * it needs COW.
+ */
+ u64 defrag_bytes;
+
+ /*
* the size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
* because not all the blocks are written yet.
@@ -248,8 +265,11 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
return 0;
}
+#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
+
struct btrfs_dio_private {
struct inode *inode;
+ unsigned long flags;
u64 logical_offset;
u64 disk_bytenr;
u64 bytes;
@@ -266,7 +286,12 @@ struct btrfs_dio_private {
/* dio_bio came from fs/direct-io.c */
struct bio *dio_bio;
- u8 csum[0];
+
+ /*
+ * The original bio may be splited to several sub-bios, this is
+ * done during endio of sub-bios
+ */
+ int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
};
/*
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ce92ae30250f..cb7f3fe9c9f6 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -807,7 +807,7 @@ static int btrfsic_process_superblock_dev_mirror(
/* super block bytenr is always the unmapped device bytenr */
dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
- if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+ if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
return -1;
bh = __bread(superblock_bdev, dev_bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
@@ -820,7 +820,6 @@ static int btrfsic_process_superblock_dev_mirror(
btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
btrfs_super_nodesize(super_tmp) != state->metablock_size ||
- btrfs_super_leafsize(super_tmp) != state->metablock_size ||
btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
brelse(bh);
return 0;
@@ -1252,8 +1251,7 @@ static void btrfsic_read_from_block_data(
while (len > 0) {
cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
- BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT);
+ BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
kaddr = block_ctx->datav[i];
memcpy(dst, kaddr + offset_in_page, cur);
@@ -3120,24 +3118,12 @@ int btrfsic_mount(struct btrfs_root *root,
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
- if (root->nodesize != root->leafsize) {
- printk(KERN_INFO
- "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
- root->nodesize, root->leafsize);
- return -1;
- }
if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
root->nodesize, PAGE_CACHE_SIZE);
return -1;
}
- if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
- printk(KERN_INFO
- "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
- root->leafsize, PAGE_CACHE_SIZE);
- return -1;
- }
if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1daea0b47187..d3220d31d3cb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -91,8 +91,7 @@ static inline int compressed_bio_size(struct btrfs_root *root,
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
return sizeof(struct compressed_bio) +
- ((disk_size + root->sectorsize - 1) / root->sectorsize) *
- csum_size;
+ (DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size;
}
static struct bio *compressed_bio_alloc(struct block_device *bdev,
@@ -389,7 +388,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
* freed before we're done setting it up
*/
atomic_inc(&cb->pending_bios);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
@@ -420,7 +420,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
}
bio_get(bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
@@ -615,8 +615,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->compress_type = extent_compress_type(bio_flags);
cb->orig_bio = bio;
- nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE;
+ nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
GFP_NOFS);
if (!cb->compressed_pages)
@@ -670,7 +669,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
PAGE_CACHE_SIZE) {
bio_get(comp_bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+ BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
/*
@@ -686,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio, sums);
BUG_ON(ret); /* -ENOMEM */
}
- sums += (comp_bio->bi_iter.bi_size +
- root->sectorsize - 1) / root->sectorsize;
+ sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
+ root->sectorsize);
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
@@ -708,7 +708,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
bio_get(comp_bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+ BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 44ee5d2e52a4..19bc6162fb8e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -258,9 +258,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
- cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
- new_root_objectid, &disk_key, level,
- buf->start, 0);
+ cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
+ &disk_key, level, buf->start, 0);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1133,9 +1132,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
} else
parent_start = 0;
- cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
- root->root_key.objectid, &disk_key,
- level, search_start, empty_size);
+ cow = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, &disk_key, level,
+ search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1425,7 +1424,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
u64 logical;
- u32 blocksize;
eb_root = btrfs_read_lock_root_node(root);
tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
@@ -1444,8 +1442,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- blocksize = btrfs_level_size(root, old_root->level);
- old = read_tree_block(root, logical, blocksize, 0);
+ old = read_tree_block(root, logical, 0);
if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
free_extent_buffer(old);
btrfs_warn(root->fs_info,
@@ -1506,10 +1503,9 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
/* ensure we can see the force_cow */
smp_rmb();
@@ -1651,7 +1647,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
WARN_ON(trans->transid != root->fs_info->generation);
parent_nritems = btrfs_header_nritems(parent);
- blocksize = btrfs_level_size(root, parent_level - 1);
+ blocksize = root->nodesize;
end_slot = parent_nritems;
if (parent_nritems == 1)
@@ -1685,15 +1681,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
continue;
}
- cur = btrfs_find_tree_block(root, blocknr, blocksize);
+ cur = btrfs_find_tree_block(root, blocknr);
if (cur)
uptodate = btrfs_buffer_uptodate(cur, gen, 0);
else
uptodate = 0;
if (!cur || !uptodate) {
if (!cur) {
- cur = read_tree_block(root, blocknr,
- blocksize, gen);
+ cur = read_tree_block(root, blocknr, gen);
if (!cur || !extent_buffer_uptodate(cur)) {
free_extent_buffer(cur);
return -EIO;
@@ -1872,7 +1867,6 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
BUG_ON(level == 0);
eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
- btrfs_level_size(root, level - 1),
btrfs_node_ptr_generation(parent, slot));
if (eb && !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
@@ -2267,8 +2261,8 @@ static void reada_for_search(struct btrfs_root *root,
node = path->nodes[level];
search = btrfs_node_blockptr(node, slot);
- blocksize = btrfs_level_size(root, level - 1);
- eb = btrfs_find_tree_block(root, search, blocksize);
+ blocksize = root->nodesize;
+ eb = btrfs_find_tree_block(root, search);
if (eb) {
free_extent_buffer(eb);
return;
@@ -2298,7 +2292,7 @@ static void reada_for_search(struct btrfs_root *root,
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
gen = btrfs_node_ptr_generation(node, nr);
- readahead_tree_block(root, search, blocksize, gen);
+ readahead_tree_block(root, search, blocksize);
nread += blocksize;
}
nscan++;
@@ -2325,12 +2319,12 @@ static noinline void reada_for_balance(struct btrfs_root *root,
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
- blocksize = btrfs_level_size(root, level);
+ blocksize = root->nodesize;
if (slot > 0) {
block1 = btrfs_node_blockptr(parent, slot - 1);
gen = btrfs_node_ptr_generation(parent, slot - 1);
- eb = btrfs_find_tree_block(root, block1, blocksize);
+ eb = btrfs_find_tree_block(root, block1);
/*
* if we get -eagain from btrfs_buffer_uptodate, we
* don't want to return eagain here. That will loop
@@ -2343,16 +2337,16 @@ static noinline void reada_for_balance(struct btrfs_root *root,
if (slot + 1 < nritems) {
block2 = btrfs_node_blockptr(parent, slot + 1);
gen = btrfs_node_ptr_generation(parent, slot + 1);
- eb = btrfs_find_tree_block(root, block2, blocksize);
+ eb = btrfs_find_tree_block(root, block2);
if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
block2 = 0;
free_extent_buffer(eb);
}
if (block1)
- readahead_tree_block(root, block1, blocksize, 0);
+ readahead_tree_block(root, block1, blocksize);
if (block2)
- readahead_tree_block(root, block2, blocksize, 0);
+ readahead_tree_block(root, block2, blocksize);
}
@@ -2454,16 +2448,14 @@ read_block_for_search(struct btrfs_trans_handle *trans,
{
u64 blocknr;
u64 gen;
- u32 blocksize;
struct extent_buffer *b = *eb_ret;
struct extent_buffer *tmp;
int ret;
blocknr = btrfs_node_blockptr(b, slot);
gen = btrfs_node_ptr_generation(b, slot);
- blocksize = btrfs_level_size(root, level - 1);
- tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+ tmp = btrfs_find_tree_block(root, blocknr);
if (tmp) {
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -2507,7 +2499,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_release_path(p);
ret = -EAGAIN;
- tmp = read_tree_block(root, blocknr, blocksize, 0);
+ tmp = read_tree_block(root, blocknr, 0);
if (tmp) {
/*
* If the read above didn't mark this buffer up to date,
@@ -2792,8 +2784,6 @@ again:
if (!should_cow_block(trans, root, b))
goto cow_done;
- btrfs_set_path_blocking(p);
-
/*
* must have write locks on this node and the
* parent
@@ -2807,6 +2797,7 @@ again:
goto again;
}
+ btrfs_set_path_blocking(p);
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
p->slots[level + 1], &b);
@@ -3362,9 +3353,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
- root->root_key.objectid, &lower_key,
- level, root->node->start, 0);
+ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &lower_key, level, root->node->start, 0);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3502,9 +3492,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
- root->root_key.objectid,
- &disk_key, level, c->start, 0);
+ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, level, c->start, 0);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -4282,13 +4271,12 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
- right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- root->root_key.objectid,
- &disk_key, 0, l->start, 0);
+ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, 0, l->start, 0);
if (IS_ERR(right))
return PTR_ERR(right);
- root_add_used(root, root->leafsize);
+ root_add_used(root, root->nodesize);
memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(right, right->start);
@@ -4626,8 +4614,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
ptr = btrfs_item_ptr_offset(leaf, slot);
memmove_extent_buffer(leaf, ptr,
(unsigned long)fi,
- offsetof(struct btrfs_file_extent_item,
- disk_bytenr));
+ BTRFS_FILE_EXTENT_INLINE_DATA_START);
}
}
@@ -4738,6 +4725,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
int slot;
struct btrfs_map_token token;
+ if (path->slots[0] == 0) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ fixup_low_keys(root, path, &disk_key, 1);
+ }
+ btrfs_unlock_up_safe(path, 1);
+
btrfs_init_map_token(&token);
leaf = path->nodes[0];
@@ -4798,12 +4791,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
}
btrfs_set_header_nritems(leaf, nritems + nr);
-
- if (slot == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- fixup_low_keys(root, path, &disk_key, 1);
- }
- btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(root, leaf) < 0) {
@@ -5145,8 +5132,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
u32 nritems;
int level;
int ret = 1;
+ int keep_locks = path->keep_locks;
- WARN_ON(!path->keep_locks);
+ path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
level = btrfs_header_level(cur);
@@ -5210,7 +5198,6 @@ find_next_key:
path->slots[level] = slot;
if (level == path->lowest_level) {
ret = 0;
- unlock_up(path, level, 1, 0, NULL);
goto out;
}
btrfs_set_path_blocking(path);
@@ -5225,9 +5212,12 @@ find_next_key:
btrfs_clear_path_blocking(path, NULL, 0);
}
out:
- if (ret == 0)
+ path->keep_locks = keep_locks;
+ if (ret == 0) {
+ btrfs_unlock_up_safe(path, path->lowest_level + 1);
+ btrfs_set_path_blocking(path);
memcpy(min_key, &found_key, sizeof(found_key));
- btrfs_set_path_blocking(path);
+ }
return ret;
}
@@ -5375,7 +5365,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
+ tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
if (!tmp_buf) {
ret = -ENOMEM;
goto out;
@@ -5520,18 +5510,18 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
advance_right = ADVANCE;
} else {
- enum btrfs_compare_tree_result cmp;
+ enum btrfs_compare_tree_result result;
WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
ret = tree_compare_item(left_root, left_path,
right_path, tmp_buf);
if (ret)
- cmp = BTRFS_COMPARE_TREE_CHANGED;
+ result = BTRFS_COMPARE_TREE_CHANGED;
else
- cmp = BTRFS_COMPARE_TREE_SAME;
+ result = BTRFS_COMPARE_TREE_SAME;
ret = changed_cb(left_root, right_root,
left_path, right_path,
- &left_key, cmp, ctx);
+ &left_key, result, ctx);
if (ret < 0)
goto out;
advance_left = ADVANCE;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8e29b614fe93..d557264ee974 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
#include <linux/pagemap.h>
#include <linux/btrfs.h>
#include <linux/workqueue.h>
+#include <linux/security.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -62,13 +63,6 @@ struct btrfs_ordered_sum;
#define BTRFS_COMPAT_EXTENT_TREE_V0
-/*
- * files bigger than this get some pre-flushing when they are added
- * to the ordered operations list. That way we limit the total
- * work done by the commit
- */
-#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
-
/* holds pointers to all of the tree roots */
#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -391,10 +385,12 @@ struct btrfs_header {
sizeof(struct btrfs_header)) / \
sizeof(struct btrfs_key_ptr))
#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
-#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
+ (offsetof(struct btrfs_file_extent_item, disk_bytenr))
#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) - \
- sizeof(struct btrfs_file_extent_item))
+ BTRFS_FILE_EXTENT_INLINE_DATA_START)
#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) -\
sizeof(struct btrfs_dir_item))
@@ -474,7 +470,7 @@ struct btrfs_super_block {
__le64 num_devices;
__le32 sectorsize;
__le32 nodesize;
- __le32 leafsize;
+ __le32 __unused_leafsize;
__le32 stripesize;
__le32 sys_chunk_array_size;
__le64 chunk_root_generation;
@@ -903,6 +899,8 @@ struct btrfs_file_extent_item {
/*
* disk space consumed by the extent, checksum blocks are included
* in these numbers
+ *
+ * At this offset in the structure, the inline extent data start.
*/
__le64 disk_bytenr;
__le64 disk_num_bytes;
@@ -1305,8 +1303,8 @@ struct btrfs_block_group_cache {
*/
struct list_head cluster_list;
- /* For delayed block group creation */
- struct list_head new_bg_list;
+ /* For delayed block group creation or deletion of empty block groups */
+ struct list_head bg_list;
};
/* delayed seq elem */
@@ -1545,6 +1543,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *endio_workers;
struct btrfs_workqueue *endio_meta_workers;
struct btrfs_workqueue *endio_raid56_workers;
+ struct btrfs_workqueue *endio_repair_workers;
struct btrfs_workqueue *rmw_workers;
struct btrfs_workqueue *endio_meta_write_workers;
struct btrfs_workqueue *endio_write_workers;
@@ -1574,6 +1573,7 @@ struct btrfs_fs_info {
int do_barriers;
int closing;
int log_root_recovering;
+ int open;
u64 total_pinned;
@@ -1723,6 +1723,12 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
+
+ spinlock_t unused_bgs_lock;
+ struct list_head unused_bgs;
+
+ /* For btrfs to record security options */
+ struct security_mnt_opts security_opts;
};
struct btrfs_subvolume_writers {
@@ -1776,12 +1782,12 @@ struct btrfs_root {
/* free ino cache stuff */
struct btrfs_free_space_ctl *free_ino_ctl;
- enum btrfs_caching_type cached;
- spinlock_t cache_lock;
- wait_queue_head_t cache_wait;
+ enum btrfs_caching_type ino_cache_state;
+ spinlock_t ino_cache_lock;
+ wait_queue_head_t ino_cache_wait;
struct btrfs_free_space_ctl *free_ino_pinned;
- u64 cache_progress;
- struct inode *cache_inode;
+ u64 ino_cache_progress;
+ struct inode *ino_cache_inode;
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
@@ -1806,18 +1812,14 @@ struct btrfs_root {
/* node allocations are done in nodesize units */
u32 nodesize;
- /* leaf allocations are done in leafsize units */
- u32 leafsize;
-
u32 stripesize;
u32 type;
u64 highest_objectid;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
u64 alloc_bytenr;
-#endif
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
@@ -2094,6 +2096,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_DEFAULT_MAX_INLINE (8192)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2995,8 +2998,6 @@ BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
sectorsize, 32);
BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
nodesize, 32);
-BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
- leafsize, 32);
BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
stripesize, 32);
BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
@@ -3049,14 +3050,12 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
static inline unsigned long
btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
{
- unsigned long offset = (unsigned long)e;
- offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
- return offset;
+ return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
{
- return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
+ return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
}
BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
@@ -3086,9 +3085,7 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
struct btrfs_item *e)
{
- unsigned long offset;
- offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
- return btrfs_item_size(eb, e) - offset;
+ return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
/* this returns the number of file bytes represented by the inline item.
@@ -3232,13 +3229,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
return sb->s_fs_info;
}
-static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
-{
- if (level == 0)
- return root->leafsize;
- return root->nodesize;
-}
-
/* helper function to cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
((type *)(btrfs_leaf_data(leaf) + \
@@ -3263,7 +3253,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
- return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2 * num_items;
}
@@ -3274,8 +3264,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
- return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
- num_items;
+ return root->nodesize * BTRFS_MAX_LEVEL * num_items;
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
@@ -3305,9 +3294,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
u64 bytenr);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int get_block_group_index(struct btrfs_block_group_cache *cache);
-struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u32 blocksize,
- u64 parent, u64 root_objectid,
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
@@ -3363,6 +3352,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3604,6 +3594,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->uuid_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
+ security_free_mnt_opts(&fs_info->security_opts);
kfree(fs_info);
}
@@ -3739,8 +3730,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst);
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct btrfs_dio_private *dip, struct bio *bio,
- u64 logical_offset);
+ struct bio *bio, u64 logical_offset);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -4141,8 +4131,15 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode);
-int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
- u64 rfer, u64 excl);
#endif
+static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 1;
+#endif
+ return 0;
+}
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a2e90f855d7d..054577bddaf2 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1042,7 +1042,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
int ret;
key.objectid = node->inode_id;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
@@ -1099,7 +1099,7 @@ err_out:
search:
btrfs_release_path(path);
- btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = -1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -1473,7 +1473,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
}
delayed_item->key.objectid = btrfs_ino(dir);
- btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
+ delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
delayed_item->key.offset = index;
dir_item = (struct btrfs_dir_item *)delayed_item->data;
@@ -1542,7 +1542,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
return PTR_ERR(node);
item_key.objectid = btrfs_ino(dir);
- btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY);
+ item_key.type = BTRFS_DIR_INDEX_KEY;
item_key.offset = index;
ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index eea26e1b2fda..6f662b34ba0e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -168,8 +168,12 @@ no_valid_dev_replace_entry_found:
dev_replace->srcdev->total_bytes;
dev_replace->tgtdev->disk_total_bytes =
dev_replace->srcdev->disk_total_bytes;
+ dev_replace->tgtdev->commit_total_bytes =
+ dev_replace->srcdev->commit_total_bytes;
dev_replace->tgtdev->bytes_used =
dev_replace->srcdev->bytes_used;
+ dev_replace->tgtdev->commit_bytes_used =
+ dev_replace->srcdev->commit_bytes_used;
}
dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
@@ -329,30 +333,34 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->start.tgtdev_name[0] == '\0')
return -EINVAL;
- mutex_lock(&fs_info->volume_mutex);
- ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
- &tgt_device);
- if (ret) {
- btrfs_err(fs_info, "target device %s is invalid!",
- args->start.tgtdev_name);
- mutex_unlock(&fs_info->volume_mutex);
- return -EINVAL;
+ /*
+ * Here we commit the transaction to make sure commit_total_bytes
+ * of all the devices are updated.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ } else if (PTR_ERR(trans) != -ENOENT) {
+ return PTR_ERR(trans);
}
+ /* the disk copy procedure reuses the scrub code */
+ mutex_lock(&fs_info->volume_mutex);
ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
args->start.srcdev_name,
&src_device);
- mutex_unlock(&fs_info->volume_mutex);
if (ret) {
- ret = -EINVAL;
- goto leave_no_lock;
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
}
- if (tgt_device->total_bytes < src_device->total_bytes) {
- btrfs_err(fs_info, "target device is smaller than source device!");
- ret = -EINVAL;
- goto leave_no_lock;
- }
+ ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+ src_device, &tgt_device);
+ mutex_unlock(&fs_info->volume_mutex);
+ if (ret)
+ return ret;
btrfs_dev_replace_lock(dev_replace);
switch (dev_replace->replace_state) {
@@ -380,10 +388,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
src_device->devid,
rcu_str_deref(tgt_device->name));
- tgt_device->total_bytes = src_device->total_bytes;
- tgt_device->disk_total_bytes = src_device->disk_total_bytes;
- tgt_device->bytes_used = src_device->bytes_used;
-
/*
* from now on, the writes to the srcdev are all duplicated to
* go to the tgtdev as well (refer to btrfs_map_block()).
@@ -414,7 +418,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
/* the disk copy procedure reuses the scrub code */
ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
- src_device->total_bytes,
+ btrfs_device_get_total_bytes(src_device),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(root->fs_info, ret);
@@ -426,9 +430,7 @@ leave:
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
btrfs_dev_replace_unlock(dev_replace);
-leave_no_lock:
- if (tgt_device)
- btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
return ret;
}
@@ -507,9 +509,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
ret = btrfs_commit_transaction(trans, root);
WARN_ON(ret);
+ mutex_lock(&uuid_mutex);
/* keep away write_all_supers() during the finishing procedure */
- mutex_lock(&root->fs_info->chunk_mutex);
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&root->fs_info->chunk_mutex);
btrfs_dev_replace_lock(dev_replace);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -532,8 +535,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_replace_unlock(dev_replace);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -542,7 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
printk_in_rcu(KERN_INFO
- "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n",
+ "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -550,23 +554,29 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
tgt_device->is_tgtdev_for_dev_replace = 0;
tgt_device->devid = src_device->devid;
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
- tgt_device->bytes_used = src_device->bytes_used;
memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
- tgt_device->total_bytes = src_device->total_bytes;
- tgt_device->disk_total_bytes = src_device->disk_total_bytes;
- tgt_device->bytes_used = src_device->bytes_used;
+ btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
+ btrfs_device_set_disk_total_bytes(tgt_device,
+ src_device->disk_total_bytes);
+ btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
+ ASSERT(list_empty(&src_device->resized_list));
+ tgt_device->commit_total_bytes = src_device->commit_total_bytes;
+ tgt_device->commit_bytes_used = src_device->bytes_used;
if (fs_info->sb->s_bdev == src_device->bdev)
fs_info->sb->s_bdev = tgt_device->bdev;
if (fs_info->fs_devices->latest_bdev == src_device->bdev)
fs_info->fs_devices->latest_bdev = tgt_device->bdev;
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+ fs_info->fs_devices->rw_devices++;
/* replace the sysfs entry */
btrfs_kobj_rm_device(fs_info, src_device);
btrfs_kobj_add_device(fs_info, tgt_device);
+ btrfs_dev_replace_unlock(dev_replace);
+
btrfs_rm_dev_replace_blocked(fs_info);
btrfs_rm_dev_replace_srcdev(fs_info, src_device);
@@ -580,9 +590,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* superblock is scratched out so that it is no longer marked to
* belong to this filesystem.
*/
- btrfs_dev_replace_unlock(dev_replace);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -643,6 +653,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *srcdev;
btrfs_dev_replace_lock(dev_replace);
/* even if !dev_replace_is_valid, the values are good enough for
@@ -665,8 +676,9 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ srcdev = dev_replace->srcdev;
args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
- div64_u64(dev_replace->srcdev->total_bytes, 1000));
+ div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
break;
}
btrfs_dev_replace_unlock(dev_replace);
@@ -825,7 +837,7 @@ static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
dev_replace->committed_cursor_left,
- dev_replace->srcdev->total_bytes,
+ btrfs_device_get_total_bytes(dev_replace->srcdev),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a0691df5dcea..fc8df866e919 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -86,7 +86,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
data_size = sizeof(*dir_item) + name_len + data_len;
@@ -137,7 +137,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
u32 data_size;
key.objectid = btrfs_ino(dir);
- btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
path = btrfs_alloc_path();
@@ -204,7 +204,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
int cow = mod != 0;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
@@ -234,7 +234,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
return -ENOMEM;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -297,7 +297,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
int cow = mod != 0;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.type = BTRFS_DIR_INDEX_KEY;
key.offset = objectid;
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
@@ -367,7 +367,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
int cow = mod != 0;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a1d36e62179c..fa45e3cae40d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -72,21 +72,41 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root);
static void btrfs_error_commit_super(struct btrfs_root *root);
/*
- * end_io_wq structs are used to do processing in task context when an IO is
- * complete. This is used during reads to verify checksums, and it is used
+ * btrfs_end_io_wq structs are used to do processing in task context when an IO
+ * is complete. This is used during reads to verify checksums, and it is used
* by writes to insert metadata for new file extents after IO is complete.
*/
-struct end_io_wq {
+struct btrfs_end_io_wq {
struct bio *bio;
bio_end_io_t *end_io;
void *private;
struct btrfs_fs_info *info;
int error;
- int metadata;
+ enum btrfs_wq_endio_type metadata;
struct list_head list;
struct btrfs_work work;
};
+static struct kmem_cache *btrfs_end_io_wq_cache;
+
+int __init btrfs_end_io_wq_init(void)
+{
+ btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
+ sizeof(struct btrfs_end_io_wq),
+ 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_end_io_wq_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void btrfs_end_io_wq_exit(void)
+{
+ if (btrfs_end_io_wq_cache)
+ kmem_cache_destroy(btrfs_end_io_wq_cache);
+}
+
/*
* async submit bios are used to offload expensive checksumming
* onto the worker threads. They checksum file and metadata bios
@@ -327,8 +347,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
{
struct extent_state *cached_state = NULL;
int ret;
- bool need_lock = (current->journal_info ==
- (void *)BTRFS_SEND_TRANS_STUB);
+ bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
@@ -348,9 +367,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 0;
goto out;
}
- printk_ratelimited("parent transid verify failed on %llu wanted %llu "
- "found %llu\n",
- eb->start, parent_transid, btrfs_header_generation(eb));
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+ eb->fs_info->sb->s_id, eb->start,
+ parent_transid, btrfs_header_generation(eb));
ret = 1;
/*
@@ -607,22 +626,22 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
goto err;
eb->read_mirror = mirror;
- if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
ret = -EIO;
goto err;
}
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_INFO "BTRFS: bad tree block start "
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
"%llu %llu\n",
- found_start, eb->start);
+ eb->fs_info->sb->s_id, found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root, eb)) {
- printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n",
- eb->start);
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
+ eb->fs_info->sb->s_id, eb->start);
ret = -EIO;
goto err;
}
@@ -680,7 +699,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
eb = (struct extent_buffer *)page->private;
- set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = failed_mirror;
atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -690,7 +709,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
static void end_workqueue_bio(struct bio *bio, int err)
{
- struct end_io_wq *end_io_wq = bio->bi_private;
+ struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
struct btrfs_workqueue *wq;
btrfs_work_func_t func;
@@ -713,7 +732,11 @@ static void end_workqueue_bio(struct bio *bio, int err)
func = btrfs_endio_write_helper;
}
} else {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ if (unlikely(end_io_wq->metadata ==
+ BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+ wq = fs_info->endio_repair_workers;
+ func = btrfs_endio_repair_helper;
+ } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
wq = fs_info->endio_raid56_workers;
func = btrfs_endio_raid56_helper;
} else if (end_io_wq->metadata) {
@@ -729,19 +752,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
btrfs_queue_work(wq, &end_io_wq->work);
}
-/*
- * For the metadata arg you want
- *
- * 0 - if data
- * 1 - if normal metadta
- * 2 - if writing to the free space cache area
- * 3 - raid parity work
- */
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- int metadata)
+ enum btrfs_wq_endio_type metadata)
{
- struct end_io_wq *end_io_wq;
- end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
+ struct btrfs_end_io_wq *end_io_wq;
+
+ end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
if (!end_io_wq)
return -ENOMEM;
@@ -925,7 +941,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
* can happen in the async kernel threads
*/
ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
- bio, 1);
+ bio, BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
@@ -1057,20 +1073,17 @@ static const struct address_space_operations btree_aops = {
.set_page_dirty = btree_set_page_dirty,
};
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
- u64 parent_transid)
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
{
struct extent_buffer *buf = NULL;
struct inode *btree_inode = root->fs_info->btree_inode;
- int ret = 0;
buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!buf)
- return 0;
+ return;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
buf, 0, WAIT_NONE, btree_get_extent, 0);
free_extent_buffer(buf);
- return ret;
}
int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1106,7 +1119,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
}
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize)
+ u64 bytenr)
{
return find_extent_buffer(root->fs_info, bytenr);
}
@@ -1114,11 +1127,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return alloc_test_extent_buffer(root->fs_info, bytenr,
blocksize);
-#endif
return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
}
@@ -1136,12 +1147,12 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
}
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
- u32 blocksize, u64 parent_transid)
+ u64 parent_transid)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
if (!buf)
return NULL;
@@ -1183,7 +1194,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
if (!writers)
return ERR_PTR(-ENOMEM);
- ret = percpu_counter_init(&writers->counter, 0);
+ ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
if (ret < 0) {
kfree(writers);
return ERR_PTR(ret);
@@ -1200,16 +1211,14 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
kfree(writers);
}
-static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
- u32 stripesize, struct btrfs_root *root,
- struct btrfs_fs_info *fs_info,
+static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
+ struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
root->node = NULL;
root->commit_root = NULL;
root->sectorsize = sectorsize;
root->nodesize = nodesize;
- root->leafsize = leafsize;
root->stripesize = stripesize;
root->state = 0;
root->orphan_cleanup_state = 0;
@@ -1295,7 +1304,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
root = btrfs_alloc_root(NULL);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+ __setup_root(4096, 4096, 4096, root, NULL, 1);
set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
root->alloc_bytenr = 0;
@@ -1318,15 +1327,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, objectid);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, objectid);
root->root_key.objectid = objectid;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -1396,9 +1403,9 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info,
+ BTRFS_TREE_LOG_OBJECTID);
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1413,9 +1420,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
* updated (along with back refs to the log tree).
*/
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- BTRFS_TREE_LOG_OBJECTID, NULL,
- 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
+ NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
kfree(root);
return ERR_CAST(leaf);
@@ -1465,7 +1471,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
- btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+ btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_node(&log_root->root_item, log_root->node);
@@ -1485,7 +1491,6 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
u64 generation;
- u32 blocksize;
int ret;
path = btrfs_alloc_path();
@@ -1498,9 +1503,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
goto alloc_fail;
}
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, key->objectid);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, key->objectid);
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
@@ -1511,9 +1515,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
}
generation = btrfs_root_generation(&root->root_item);
- blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
- blocksize, generation);
+ generation);
if (!root->node) {
ret = -ENOMEM;
goto find_fail;
@@ -1573,8 +1576,8 @@ int btrfs_init_fs_root(struct btrfs_root *root)
root->subv_writers = writers;
btrfs_init_free_ino_ctl(root);
- spin_lock_init(&root->cache_lock);
- init_waitqueue_head(&root->cache_wait);
+ spin_lock_init(&root->ino_cache_lock);
+ init_waitqueue_head(&root->ino_cache_wait);
ret = get_anon_bdev(&root->anon_dev);
if (ret)
@@ -1708,10 +1711,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
return ret;
}
-/*
- * If this fails, caller must call bdi_destroy() to get rid of the
- * bdi again.
- */
static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
int err;
@@ -1734,16 +1733,16 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
static void end_workqueue_fn(struct btrfs_work *work)
{
struct bio *bio;
- struct end_io_wq *end_io_wq;
+ struct btrfs_end_io_wq *end_io_wq;
int error;
- end_io_wq = container_of(work, struct end_io_wq, work);
+ end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
bio = end_io_wq->bio;
error = end_io_wq->error;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
- kfree(end_io_wq);
+ kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
bio_endio_nodec(bio, error);
}
@@ -1772,6 +1771,7 @@ static int cleaner_kthread(void *arg)
}
btrfs_run_delayed_iputs(root);
+ btrfs_delete_unused_bgs(root->fs_info);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -2063,6 +2063,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->endio_workers);
btrfs_destroy_workqueue(fs_info->endio_meta_workers);
btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ btrfs_destroy_workqueue(fs_info->endio_repair_workers);
btrfs_destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2143,8 +2144,6 @@ int open_ctree(struct super_block *sb,
{
u32 sectorsize;
u32 nodesize;
- u32 leafsize;
- u32 blocksize;
u32 stripesize;
u64 generation;
u64 features;
@@ -2188,7 +2187,7 @@ int open_ctree(struct super_block *sb,
goto fail_srcu;
}
- ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_bdi;
@@ -2196,13 +2195,13 @@ int open_ctree(struct super_block *sb,
fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
(1 + ilog2(nr_cpu_ids));
- ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_dirty_metadata_bytes;
}
- ret = percpu_counter_init(&fs_info->bio_counter, 0);
+ ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_delalloc_bytes;
@@ -2233,6 +2232,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->unused_bgs_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
@@ -2242,6 +2242,7 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_LIST_HEAD(&fs_info->unused_bgs);
btrfs_mapping_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
@@ -2260,7 +2261,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
- fs_info->max_inline = 8192 * 1024;
+ fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->free_chunk_space = 0;
@@ -2389,7 +2390,7 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- __setup_root(4096, 4096, 4096, 4096, tree_root,
+ __setup_root(4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
invalidate_bdev(fs_devices->latest_bdev);
@@ -2469,19 +2470,22 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- if (btrfs_super_leafsize(disk_super) !=
+ /*
+ * Leafsize and nodesize were always equal, this is only a sanity check.
+ */
+ if (le32_to_cpu(disk_super->__unused_leafsize) !=
btrfs_super_nodesize(disk_super)) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksizes don't match. node %d leaf %d\n",
btrfs_super_nodesize(disk_super),
- btrfs_super_leafsize(disk_super));
+ le32_to_cpu(disk_super->__unused_leafsize));
err = -EINVAL;
goto fail_alloc;
}
- if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksize (%d) was too large\n",
- btrfs_super_leafsize(disk_super));
+ btrfs_super_nodesize(disk_super));
err = -EINVAL;
goto fail_alloc;
}
@@ -2498,17 +2502,16 @@ int open_ctree(struct super_block *sb,
* flag our filesystem as having big metadata blocks if
* they are bigger than the page size
*/
- if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+ if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
}
nodesize = btrfs_super_nodesize(disk_super);
- leafsize = btrfs_super_leafsize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
stripesize = btrfs_super_stripesize(disk_super);
- fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
/*
@@ -2516,7 +2519,7 @@ int open_ctree(struct super_block *sb,
* extent buffers for the same range. It leads to corruptions
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (sectorsize != leafsize)) {
+ (sectorsize != nodesize)) {
printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
"are not allowed for mixed block groups on %s\n",
sb->s_id);
@@ -2579,6 +2582,8 @@ int open_ctree(struct super_block *sb,
btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
fs_info->endio_raid56_workers =
btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->endio_repair_workers =
+ btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
fs_info->rmw_workers =
btrfs_alloc_workqueue("rmw", flags, max_active, 2);
fs_info->endio_write_workers =
@@ -2600,11 +2605,12 @@ int open_ctree(struct super_block *sb,
fs_info->submit_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_meta_write_workers &&
+ fs_info->endio_repair_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->readahead_workers &&
fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->fixup_workers && fs_info->extent_workers &&
+ fs_info->extent_workers &&
fs_info->qgroup_rescan_workers)) {
err = -ENOMEM;
goto fail_sb_buffer;
@@ -2615,7 +2621,6 @@ int open_ctree(struct super_block *sb,
4 * 1024 * 1024 / PAGE_CACHE_SIZE);
tree_root->nodesize = nodesize;
- tree_root->leafsize = leafsize;
tree_root->sectorsize = sectorsize;
tree_root->stripesize = stripesize;
@@ -2642,16 +2647,14 @@ int open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
- blocksize = btrfs_level_size(tree_root,
- btrfs_super_chunk_root_level(disk_super));
generation = btrfs_super_chunk_root_generation(disk_super);
- __setup_root(nodesize, leafsize, sectorsize, stripesize,
- chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+ __setup_root(nodesize, sectorsize, stripesize, chunk_root,
+ fs_info, BTRFS_CHUNK_TREE_OBJECTID);
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
- blocksize, generation);
+ generation);
if (!chunk_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
@@ -2684,13 +2687,11 @@ int open_ctree(struct super_block *sb,
}
retry_root_backup:
- blocksize = btrfs_level_size(tree_root,
- btrfs_super_root_level(disk_super));
generation = btrfs_super_generation(disk_super);
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
- blocksize, generation);
+ generation);
if (!tree_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
@@ -2859,9 +2860,6 @@ retry_root_backup:
err = -EIO;
goto fail_qgroup;
}
- blocksize =
- btrfs_level_size(tree_root,
- btrfs_super_log_root_level(disk_super));
log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
@@ -2869,11 +2867,10 @@ retry_root_backup:
goto fail_qgroup;
}
- __setup_root(nodesize, leafsize, sectorsize, stripesize,
+ __setup_root(nodesize, sectorsize, stripesize,
log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
log_tree_root->node = read_tree_block(tree_root, bytenr,
- blocksize,
generation + 1);
if (!log_tree_root->node ||
!extent_buffer_uptodate(log_tree_root->node)) {
@@ -2980,6 +2977,8 @@ retry_root_backup:
fs_info->update_uuid_tree_gen = 1;
}
+ fs_info->open = 1;
+
return 0;
fail_qgroup:
@@ -3139,7 +3138,8 @@ static int write_dev_supers(struct btrfs_device *device,
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ device->commit_total_bytes)
break;
if (wait) {
@@ -3456,8 +3456,9 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
btrfs_set_stack_device_type(dev_item, dev->type);
btrfs_set_stack_device_id(dev_item, dev->devid);
btrfs_set_stack_device_total_bytes(dev_item,
- dev->disk_total_bytes);
- btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+ dev->commit_total_bytes);
+ btrfs_set_stack_device_bytes_used(dev_item,
+ dev->commit_bytes_used);
btrfs_set_stack_device_io_align(dev_item, dev->io_align);
btrfs_set_stack_device_io_width(dev_item, dev->io_width);
btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
@@ -3532,7 +3533,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
static void free_fs_root(struct btrfs_root *root)
{
- iput(root->cache_inode);
+ iput(root->ino_cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
btrfs_free_block_rsv(root, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
@@ -3623,7 +3624,7 @@ int btrfs_commit_super(struct btrfs_root *root)
return btrfs_commit_transaction(trans, root);
}
-int close_ctree(struct btrfs_root *root)
+void close_ctree(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3689,6 +3690,7 @@ int close_ctree(struct btrfs_root *root)
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
+ fs_info->open = 0;
free_root_pointers(fs_info, 1);
iput(fs_info->btree_inode);
@@ -3711,8 +3713,6 @@ int close_ctree(struct btrfs_root *root)
btrfs_free_block_rsv(root, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
-
- return 0;
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3814,10 +3814,73 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ int ret = 0;
+
+ if (sb->root_level > BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
+ sb->root_level, BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
+ sb->chunk_root_level, BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (sb->log_root_level > BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
+ sb->log_root_level, BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+
/*
- * Placeholder for checks
+ * The common minimum, we don't know if we can trust the nodesize/sectorsize
+ * items yet, they'll be verified later. Issue just a warning.
*/
- return 0;
+ if (!IS_ALIGNED(sb->root, 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ sb->root);
+ if (!IS_ALIGNED(sb->chunk_root, 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ sb->chunk_root);
+ if (!IS_ALIGNED(sb->log_root, 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ sb->log_root);
+
+ if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+ printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
+ fs_info->fsid, sb->dev_item.fsid);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+ * done later
+ */
+ if (sb->num_devices > (1UL << 31))
+ printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
+ sb->num_devices);
+
+ if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
+ printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
+ sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
+ ret = -EINVAL;
+ }
+
+ /*
+ * The generation is a global counter, we'll trust it more than the others
+ * but it's still possible that it's the one that's wrong.
+ */
+ if (sb->generation < sb->chunk_root_generation)
+ printk(KERN_WARNING
+ "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
+ sb->generation, sb->chunk_root_generation);
+ if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
+ printk(KERN_WARNING
+ "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
+ sb->generation, sb->cache_generation);
+
+ return ret;
}
static void btrfs_error_commit_super(struct btrfs_root *root)
@@ -4009,9 +4072,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
- eb = btrfs_find_tree_block(root, start,
- root->leafsize);
- start += root->leafsize;
+ eb = btrfs_find_tree_block(root, start);
+ start += root->nodesize;
if (!eb)
continue;
wait_on_extent_buffer_writeback(eb);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 23ce3ceba0a9..414651821fb3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,11 +25,12 @@
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
-enum {
+enum btrfs_wq_endio_type {
BTRFS_WQ_ENDIO_DATA = 0,
BTRFS_WQ_ENDIO_METADATA = 1,
BTRFS_WQ_ENDIO_FREE_SPACE = 2,
BTRFS_WQ_ENDIO_RAID56 = 3,
+ BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
};
static inline u64 btrfs_sb_offset(int mirror)
@@ -44,9 +45,8 @@ struct btrfs_device;
struct btrfs_fs_devices;
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
- u32 blocksize, u64 parent_transid);
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
- u64 parent_transid);
+ u64 parent_transid);
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -56,13 +56,13 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
int open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
-int close_ctree(struct btrfs_root *root);
+void close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize);
+ u64 bytenr);
struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
struct btrfs_key *location);
int btrfs_init_fs_root(struct btrfs_root *root);
@@ -119,7 +119,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- int metadata);
+ enum btrfs_wq_endio_type metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
unsigned long bio_flags, u64 bio_offset,
@@ -141,6 +141,8 @@ int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
int btrfs_calc_num_tolerated_disk_barrier_failures(
struct btrfs_fs_info *fs_info);
+int __init btrfs_end_io_wq_init(void);
+void btrfs_end_io_wq_exit(void);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 41422a3de8ed..37d164540c3a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -70,7 +70,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
return ERR_PTR(-ESTALE);
key.objectid = root_objectid;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -82,7 +82,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
}
key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(sb, &key, root, NULL);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3efe1c3877bf..d56589571012 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -491,7 +491,7 @@ next:
key.objectid);
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
- fs_info->tree_root->leafsize;
+ fs_info->tree_root->nodesize;
else
last = key.objectid + key.offset;
@@ -765,7 +765,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
* different
*/
if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
- offset = root->leafsize;
+ offset = root->nodesize;
metadata = 0;
}
@@ -799,13 +799,13 @@ again:
path->slots[0]);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY &&
- key.offset == root->leafsize)
+ key.offset == root->nodesize)
ret = 0;
}
if (ret) {
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = root->leafsize;
+ key.offset = root->nodesize;
btrfs_release_path(path);
goto again;
}
@@ -2651,7 +2651,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
num_heads = heads_to_leaves(root, num_heads);
if (num_heads > 1)
- num_bytes += (num_heads - 1) * root->leafsize;
+ num_bytes += (num_heads - 1) * root->nodesize;
num_bytes <<= 1;
global_rsv = &root->fs_info->global_block_rsv;
@@ -3073,10 +3073,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
u64, u64, u64, u64, u64, u64, int);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
@@ -3097,7 +3097,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
for (i = 0; i < nritems; i++) {
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, i);
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(buf, i,
struct btrfs_file_extent_item);
@@ -3117,7 +3117,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
- num_bytes = btrfs_level_size(root, level - 1);
+ num_bytes = root->nodesize;
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0,
1);
@@ -3494,7 +3494,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
- ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+ ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
if (ret) {
kfree(found);
return ret;
@@ -4343,11 +4343,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
}
static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_fs_info *fs_info)
+ struct btrfs_fs_info *fs_info,
+ int flush_state)
{
u64 used;
spin_lock(&space_info->lock);
+ /*
+ * We run out of space and have not got any free space via flush_space,
+ * so don't bother doing async reclaim.
+ */
+ if (flush_state > COMMIT_TRANS && space_info->full) {
+ spin_unlock(&space_info->lock);
+ return 0;
+ }
+
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
@@ -4380,11 +4390,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
flush_space(fs_info->fs_root, space_info, to_reclaim,
to_reclaim, flush_state);
flush_state++;
- if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+ if (!btrfs_need_do_async_reclaim(space_info, fs_info,
+ flush_state))
return;
} while (flush_state <= COMMIT_TRANS);
- if (btrfs_need_do_async_reclaim(space_info, fs_info))
+ if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
queue_work(system_unbound_wq, work);
}
@@ -4502,7 +4513,13 @@ again:
space_info->flush = 1;
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
- if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+ /*
+ * We will do the space reservation dance during log replay,
+ * which means we won't have fs_info->fs_root set, so don't do
+ * the async reclaim as we will panic.
+ */
+ if (!root->fs_info->log_root_recovering &&
+ need_do_async_reclaim(space_info, root->fs_info, used) &&
!work_busy(&root->fs_info->async_reclaim_work))
queue_work(system_unbound_wq,
&root->fs_info->async_reclaim_work);
@@ -4839,7 +4856,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
if (num_bytes * 3 > meta_used)
num_bytes = div64_u64(meta_used, 3);
- return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+ return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
}
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4988,7 +5005,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
- num_bytes = 3 * root->leafsize;
+ num_bytes = 3 * root->nodesize;
ret = btrfs_qgroup_reserve(root, num_bytes);
if (ret)
return ret;
@@ -5176,7 +5193,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (root->fs_info->quota_enabled) {
ret = btrfs_qgroup_reserve(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
@@ -5185,7 +5202,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (unlikely(ret)) {
if (root->fs_info->quota_enabled)
btrfs_qgroup_free(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
goto out_fail;
}
@@ -5301,7 +5318,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
btrfs_ino(inode), to_free, 0);
if (root->fs_info->quota_enabled) {
btrfs_qgroup_free(root, num_bytes +
- dropped * root->leafsize);
+ dropped * root->nodesize);
}
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
@@ -5422,6 +5439,20 @@ static int update_block_group(struct btrfs_root *root,
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
+
+ /*
+ * No longer have used bytes in this block group, queue
+ * it for deletion.
+ */
+ if (old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
btrfs_set_block_group_used(&cache->item, old_val);
cache->pinned += num_bytes;
cache->space_info->bytes_pinned += num_bytes;
@@ -6233,10 +6264,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
/*
@@ -6263,14 +6293,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ret;
}
-static u64 stripe_align(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache,
- u64 val, u64 num_bytes)
-{
- u64 ret = ALIGN(val, root->stripesize);
- return ret;
-}
-
/*
* when we wait for progress in the block group caching, its because
* our allocation attempt failed at least once. So, we must sleep
@@ -6464,7 +6486,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
bool have_caching_bg = false;
WARN_ON(num_bytes < root->sectorsize);
- btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+ ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
@@ -6751,8 +6773,7 @@ unclustered_alloc:
goto loop;
}
checks:
- search_start = stripe_align(root, block_group,
- offset, num_bytes);
+ search_start = ALIGN(offset, root->stripesize);
/* move on to the next group */
if (search_start + num_bytes >
@@ -7077,7 +7098,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->leafsize);
+ root->nodesize);
return -ENOMEM;
}
@@ -7086,7 +7107,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
ins, size);
if (ret) {
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->leafsize);
+ root->nodesize);
btrfs_free_path(path);
return ret;
}
@@ -7101,7 +7122,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
- num_bytes = root->leafsize;
+ num_bytes = root->nodesize;
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, key);
@@ -7131,14 +7152,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
}
- ret = update_block_group(root, ins->objectid, root->leafsize, 1);
+ ret = update_block_group(root, ins->objectid, root->nodesize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
BUG();
}
- trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
return ret;
}
@@ -7213,17 +7234,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_buffer_uptodate(buf);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ buf->log_index = root->log_transid % 2;
/*
* we allow two log transactions at a time, use different
* EXENT bit to differentiate dirty pages.
*/
- if (root->log_transid % 2 == 0)
+ if (buf->log_index == 0)
set_extent_dirty(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
else
set_extent_new(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
} else {
+ buf->log_index = -1;
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
}
@@ -7300,8 +7323,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
*
* returns the tree buffer or NULL.
*/
-struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u32 blocksize,
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size)
@@ -7311,18 +7334,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf;
u64 flags = 0;
int ret;
+ u32 blocksize = root->nodesize;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
+ if (btrfs_test_is_dummy_root(root)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
blocksize, level);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
}
-#endif
+
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
@@ -7417,7 +7440,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
eb = path->nodes[wc->level];
nritems = btrfs_header_nritems(eb);
- blocksize = btrfs_level_size(root, wc->level - 1);
+ blocksize = root->nodesize;
for (slot = path->slots[wc->level]; slot < nritems; slot++) {
if (nread >= wc->reada_count)
@@ -7464,10 +7487,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- ret = readahead_tree_block(root, bytenr, blocksize,
- generation);
- if (ret)
- break;
+ readahead_tree_block(root, bytenr, blocksize);
nread++;
}
wc->reada_slot = slot;
@@ -7626,7 +7646,6 @@ walk_down:
level = root_level;
while (level >= 0) {
if (path->nodes[level] == NULL) {
- int child_bsize = root->nodesize;
int parent_slot;
u64 child_gen;
u64 child_bytenr;
@@ -7638,8 +7657,7 @@ walk_down:
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- eb = read_tree_block(root, child_bytenr, child_bsize,
- child_gen);
+ eb = read_tree_block(root, child_bytenr, child_gen);
if (!eb || !extent_buffer_uptodate(eb)) {
ret = -EIO;
goto out;
@@ -7655,7 +7673,7 @@ walk_down:
ret = btrfs_qgroup_record_ref(trans, root->fs_info,
root->objectid,
child_bytenr,
- child_bsize,
+ root->nodesize,
BTRFS_QGROUP_OPER_SUB_SUBTREE,
0);
if (ret)
@@ -7806,9 +7824,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- blocksize = btrfs_level_size(root, level - 1);
+ blocksize = root->nodesize;
- next = btrfs_find_tree_block(root, bytenr, blocksize);
+ next = btrfs_find_tree_block(root, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!next)
@@ -7870,7 +7888,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(root, bytenr, blocksize, generation);
+ next = read_tree_block(root, bytenr, generation);
if (!next || !extent_buffer_uptodate(next)) {
free_extent_buffer(next);
return -EIO;
@@ -8853,6 +8871,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
up_write(&info->commit_root_sem);
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->unused_bgs)) {
+ block_group = list_first_entry(&info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -8987,7 +9015,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->new_bg_list);
+ INIT_LIST_HEAD(&cache->bg_list);
btrfs_init_free_space_ctl(cache);
return cache;
@@ -9009,7 +9037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
root = info->extent_root;
key.objectid = 0;
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -9128,8 +9156,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
set_avail_alloc_bits(root->fs_info, cache->flags);
- if (btrfs_chunk_readonly(root, cache->key.objectid))
+ if (btrfs_chunk_readonly(root, cache->key.objectid)) {
set_block_group_ro(cache, 1);
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ /* Should always be true but just in case. */
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
}
list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -9170,10 +9208,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret = 0;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
- new_bg_list) {
- list_del_init(&block_group->new_bg_list);
-
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ list_del_init(&block_group->bg_list);
if (ret)
continue;
@@ -9259,7 +9295,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache);
- list_add_tail(&cache->new_bg_list, &trans->new_bgs);
+ list_add_tail(&cache->bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
@@ -9413,8 +9449,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
memcpy(&key, &block_group->key, sizeof(key));
- btrfs_clear_space_info_full(root->fs_info);
-
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -9430,6 +9464,101 @@ out:
return ret;
}
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (!fs_info->open)
+ return;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->unused_bgs)) {
+ u64 start, end;
+
+ block_group = list_first_entry(&fs_info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ space_info = block_group->space_info;
+ list_del_init(&block_group->bg_list);
+ if (ret || btrfs_mixed_space_info(space_info)) {
+ btrfs_put_block_group(block_group);
+ continue;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't want to race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+ spin_lock(&block_group->lock);
+ if (block_group->reserved ||
+ btrfs_block_group_used(&block_group->item) ||
+ block_group->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&block_group->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&block_group->lock);
+
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = set_block_group_ro(block_group, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0) {
+ ret = 0;
+ goto next;
+ }
+
+ /*
+ * Want to do this before we do anything else so we can recover
+ * properly if we fail to join the transaction.
+ */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_set_block_group_rw(root, block_group);
+ ret = PTR_ERR(trans);
+ goto next;
+ }
+
+ /*
+ * We could have pending pinned extents for this block group,
+ * just delete them, we don't care about them anymore.
+ */
+ start = block_group->key.objectid;
+ end = start + block_group->key.offset - 1;
+ clear_extent_bits(&fs_info->freed_extents[0], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+ clear_extent_bits(&fs_info->freed_extents[1], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+
+ /* Reset pinned so btrfs_put_block_group doesn't complain */
+ block_group->pinned = 0;
+
+ /*
+ * Btrfs_remove_chunk will abort the transaction if things go
+ * horribly wrong.
+ */
+ ret = btrfs_remove_chunk(trans, root,
+ block_group->key.objectid);
+ btrfs_end_transaction(trans, root);
+next:
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
@@ -9561,7 +9690,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
int btrfs_start_nocow_write(struct btrfs_root *root)
{
- if (unlikely(atomic_read(&root->will_be_snapshoted)))
+ if (atomic_read(&root->will_be_snapshoted))
return 0;
percpu_counter_inc(&root->subv_writers->counter);
@@ -9569,7 +9698,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
* Make sure counter is updated before we check for snapshot creation.
*/
smp_mb();
- if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+ if (atomic_read(&root->will_be_snapshoted)) {
btrfs_end_nocow_write(root);
return 0;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index af0359dcf337..bf3f424e0013 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -25,6 +25,11 @@ static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set *btrfs_bioset;
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+ return !RB_EMPTY_NODE(&state->rb_node);
+}
+
#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);
@@ -59,9 +64,9 @@ void btrfs_leak_debug_check(void)
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
- printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
- "state %lu in tree %p refs %d\n",
- state->start, state->end, state->state, state->tree,
+ pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+ state->start, state->end, state->state,
+ extent_state_in_tree(state),
atomic_read(&state->refs));
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
@@ -209,7 +214,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
return state;
state->state = 0;
state->private = 0;
- state->tree = NULL;
+ RB_CLEAR_NODE(&state->rb_node);
btrfs_leak_debug_add(&state->leak_list, &states);
atomic_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
@@ -222,7 +227,7 @@ void free_extent_state(struct extent_state *state)
if (!state)
return;
if (atomic_dec_and_test(&state->refs)) {
- WARN_ON(state->tree);
+ WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del(&state->leak_list);
trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
@@ -371,8 +376,8 @@ static void merge_state(struct extent_io_tree *tree,
other->state == state->state) {
merge_cb(tree, state, other);
state->start = other->start;
- other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
free_extent_state(other);
}
}
@@ -383,8 +388,8 @@ static void merge_state(struct extent_io_tree *tree,
other->state == state->state) {
merge_cb(tree, state, other);
state->end = other->end;
- other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
free_extent_state(other);
}
}
@@ -442,7 +447,6 @@ static int insert_state(struct extent_io_tree *tree,
found->start, found->end, start, end);
return -EEXIST;
}
- state->tree = tree;
merge_state(tree, state);
return 0;
}
@@ -486,7 +490,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
free_extent_state(prealloc);
return -EEXIST;
}
- prealloc->tree = tree;
return 0;
}
@@ -524,9 +527,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
wake_up(&state->wq);
if (state->state == 0) {
next = next_state(state);
- if (state->tree) {
+ if (extent_state_in_tree(state)) {
rb_erase(&state->rb_node, &tree->state);
- state->tree = NULL;
+ RB_CLEAR_NODE(&state->rb_node);
free_extent_state(state);
} else {
WARN_ON(1);
@@ -606,8 +609,8 @@ again:
cached_state = NULL;
}
- if (cached && cached->tree && cached->start <= start &&
- cached->end > start) {
+ if (cached && extent_state_in_tree(cached) &&
+ cached->start <= start && cached->end > start) {
if (clear)
atomic_dec(&cached->refs);
state = cached;
@@ -843,7 +846,7 @@ again:
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start <= start && state->end > start &&
- state->tree) {
+ extent_state_in_tree(state)) {
node = &state->rb_node;
goto hit_next;
}
@@ -1069,7 +1072,7 @@ again:
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start <= start && state->end > start &&
- state->tree) {
+ extent_state_in_tree(state)) {
node = &state->rb_node;
goto hit_next;
}
@@ -1459,7 +1462,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
- if (state->end == start - 1 && state->tree) {
+ if (state->end == start - 1 && extent_state_in_tree(state)) {
n = rb_next(&state->rb_node);
while (n) {
state = rb_entry(n, struct extent_state,
@@ -1905,7 +1908,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bitset = 0;
spin_lock(&tree->lock);
- if (cached && cached->tree && cached->start <= start &&
+ if (cached && extent_state_in_tree(cached) && cached->start <= start &&
cached->end > start)
node = &cached->rb_node;
else
@@ -1959,27 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
SetPageUptodate(page);
}
-/*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data. This
- * io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
- * and things continue. If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
- struct page *page;
- u64 start;
- u64 len;
- u64 logical;
- unsigned long bio_flags;
- int this_mirror;
- int failed_mirror;
- int in_validation;
-};
-
-static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
- int did_repair)
+int free_io_failure(struct inode *inode, struct io_failure_record *rec)
{
int ret;
int err = 0;
@@ -2012,10 +1995,10 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
- u64 length, u64 logical, struct page *page,
- int mirror_num)
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+ struct page *page, unsigned int pg_offset, int mirror_num)
{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
@@ -2053,7 +2036,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
return -EIO;
}
bio->bi_bdev = dev->bdev;
- bio_add_page(bio, page, length, start - page_offset(page));
+ bio_add_page(bio, page, length, pg_offset);
if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
/* try to remap that extent elsewhere? */
@@ -2063,10 +2046,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
}
printk_ratelimited_in_rcu(KERN_INFO
- "BTRFS: read error corrected: ino %lu off %llu "
- "(dev %s sector %llu)\n", page->mapping->host->i_ino,
- start, rcu_str_deref(dev->name), sector);
-
+ "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+ btrfs_ino(inode), start,
+ rcu_str_deref(dev->name), sector);
bio_put(bio);
return 0;
}
@@ -2082,9 +2064,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
return -EROFS;
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
- ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
- start, p, mirror_num);
+ struct page *p = eb->pages[i];
+
+ ret = repair_io_failure(root->fs_info->btree_inode, start,
+ PAGE_CACHE_SIZE, start, p,
+ start - page_offset(p), mirror_num);
if (ret)
break;
start += PAGE_CACHE_SIZE;
@@ -2097,16 +2081,15 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
*/
-static int clean_io_failure(u64 start, struct page *page)
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+ unsigned int pg_offset)
{
u64 private;
u64 private_failure;
struct io_failure_record *failrec;
- struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct extent_state *state;
int num_copies;
- int did_repair = 0;
int ret;
private = 0;
@@ -2127,7 +2110,6 @@ static int clean_io_failure(u64 start, struct page *page)
/* there was no real error, just free the record */
pr_debug("clean_io_failure: freeing dummy error at %llu\n",
failrec->start);
- did_repair = 1;
goto out;
}
if (fs_info->sb->s_flags & MS_RDONLY)
@@ -2144,55 +2126,70 @@ static int clean_io_failure(u64 start, struct page *page)
num_copies = btrfs_num_copies(fs_info, failrec->logical,
failrec->len);
if (num_copies > 1) {
- ret = repair_io_failure(fs_info, start, failrec->len,
- failrec->logical, page,
- failrec->failed_mirror);
- did_repair = !ret;
+ repair_io_failure(inode, start, failrec->len,
+ failrec->logical, page,
+ pg_offset, failrec->failed_mirror);
}
- ret = 0;
}
out:
- if (!ret)
- ret = free_io_failure(inode, failrec, did_repair);
+ free_io_failure(inode, failrec);
- return ret;
+ return 0;
}
/*
- * this is a generic handler for readpage errors (default
- * readpage_io_failed_hook). if other copies exist, read those and write back
- * good data to the failed position. does not investigate in remapping the
- * failed extent elsewhere, hoping the device will be smart enough to do this as
- * needed
+ * Can be called when
+ * - hold extent lock
+ * - under ordered extent
+ * - the inode is freeing
*/
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
+{
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct io_failure_record *failrec;
+ struct extent_state *state, *next;
-static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int failed_mirror)
+ if (RB_EMPTY_ROOT(&failure_tree->state))
+ return;
+
+ spin_lock(&failure_tree->lock);
+ state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
+ while (state) {
+ if (state->start > end)
+ break;
+
+ ASSERT(state->end <= end);
+
+ next = next_state(state);
+
+ failrec = (struct io_failure_record *)state->private;
+ free_extent_state(state);
+ kfree(failrec);
+
+ state = next;
+ }
+ spin_unlock(&failure_tree->lock);
+}
+
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+ struct io_failure_record **failrec_ret)
{
- struct io_failure_record *failrec = NULL;
+ struct io_failure_record *failrec;
u64 private;
struct extent_map *em;
- struct inode *inode = page->mapping->host;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct bio *bio;
- struct btrfs_io_bio *btrfs_failed_bio;
- struct btrfs_io_bio *btrfs_bio;
- int num_copies;
int ret;
- int read_mode;
u64 logical;
- BUG_ON(failed_bio->bi_rw & REQ_WRITE);
-
ret = get_state_private(failure_tree, start, &private);
if (ret) {
failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
if (!failrec)
return -ENOMEM;
+
failrec->start = start;
failrec->len = end - start + 1;
failrec->this_mirror = 0;
@@ -2212,11 +2209,11 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
em = NULL;
}
read_unlock(&em_tree->lock);
-
if (!em) {
kfree(failrec);
return -EIO;
}
+
logical = start - em->start;
logical = em->block_start + logical;
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
@@ -2225,8 +2222,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
extent_set_compress_type(&failrec->bio_flags,
em->compress_type);
}
- pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
- "len=%llu\n", logical, start, failrec->len);
+
+ pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
+ logical, start, failrec->len);
+
failrec->logical = logical;
free_extent_map(em);
@@ -2246,8 +2245,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
}
} else {
failrec = (struct io_failure_record *)(unsigned long)private;
- pr_debug("bio_readpage_error: (found) logical=%llu, "
- "start=%llu, len=%llu, validation=%d\n",
+ pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
failrec->logical, failrec->start, failrec->len,
failrec->in_validation);
/*
@@ -2256,6 +2254,17 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
* clean_io_failure() clean all those errors at once.
*/
}
+
+ *failrec_ret = failrec;
+
+ return 0;
+}
+
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec, int failed_mirror)
+{
+ int num_copies;
+
num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
failrec->logical, failrec->len);
if (num_copies == 1) {
@@ -2264,10 +2273,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
* all the retry and error correction code that follows. no
* matter what the error is, it is very likely to persist.
*/
- pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
num_copies, failrec->this_mirror, failed_mirror);
- free_io_failure(inode, failrec, 0);
- return -EIO;
+ return 0;
}
/*
@@ -2287,7 +2295,6 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
BUG_ON(failrec->in_validation);
failrec->in_validation = 1;
failrec->this_mirror = failed_mirror;
- read_mode = READ_SYNC | REQ_FAILFAST_DEV;
} else {
/*
* we're ready to fulfill a) and b) alongside. get a good copy
@@ -2303,25 +2310,36 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
failrec->this_mirror++;
if (failrec->this_mirror == failed_mirror)
failrec->this_mirror++;
- read_mode = READ_SYNC;
}
if (failrec->this_mirror > num_copies) {
- pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
num_copies, failrec->this_mirror, failed_mirror);
- free_io_failure(inode, failrec, 0);
- return -EIO;
+ return 0;
}
+ return 1;
+}
+
+
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ struct page *page, int pg_offset, int icsum,
+ bio_end_io_t *endio_func, void *data)
+{
+ struct bio *bio;
+ struct btrfs_io_bio *btrfs_failed_bio;
+ struct btrfs_io_bio *btrfs_bio;
+
bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio) {
- free_io_failure(inode, failrec, 0);
- return -EIO;
- }
- bio->bi_end_io = failed_bio->bi_end_io;
+ if (!bio)
+ return NULL;
+
+ bio->bi_end_io = endio_func;
bio->bi_iter.bi_sector = failrec->logical >> 9;
bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
bio->bi_iter.bi_size = 0;
+ bio->bi_private = data;
btrfs_failed_bio = btrfs_io_bio(failed_bio);
if (btrfs_failed_bio->csum) {
@@ -2330,21 +2348,73 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
btrfs_bio = btrfs_io_bio(bio);
btrfs_bio->csum = btrfs_bio->csum_inline;
- phy_offset >>= inode->i_sb->s_blocksize_bits;
- phy_offset *= csum_size;
- memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
+ icsum *= csum_size;
+ memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
csum_size);
}
- bio_add_page(bio, page, failrec->len, start - page_offset(page));
+ bio_add_page(bio, page, failrec->len, pg_offset);
+
+ return bio;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int failed_mirror)
+{
+ struct io_failure_record *failrec;
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct bio *bio;
+ int read_mode;
+ int ret;
+
+ BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+ ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+ if (ret)
+ return ret;
+
+ ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
+ if (!ret) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ if (failed_bio->bi_vcnt > 1)
+ read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+ else
+ read_mode = READ_SYNC;
+
+ phy_offset >>= inode->i_sb->s_blocksize_bits;
+ bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+ start - page_offset(page),
+ (int)phy_offset, failed_bio->bi_end_io,
+ NULL);
+ if (!bio) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
- pr_debug("bio_readpage_error: submitting new read[%#x] to "
- "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
- failrec->this_mirror, num_copies, failrec->in_validation);
+ pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
+ read_mode, failrec->this_mirror, failrec->in_validation);
ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
failrec->this_mirror,
failrec->bio_flags, 0);
+ if (ret) {
+ free_io_failure(inode, failrec);
+ bio_put(bio);
+ }
+
return ret;
}
@@ -2469,7 +2539,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct inode *inode = page->mapping->host;
pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
- "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
+ "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
@@ -2503,7 +2573,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (ret)
uptodate = 0;
else
- clean_io_failure(start, page);
+ clean_io_failure(inode, start, page, 0);
}
if (likely(uptodate))
@@ -2540,12 +2610,12 @@ readpage_ok:
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
- unsigned offset;
+ unsigned off;
/* Zero out the end if this page straddles i_size */
- offset = i_size & (PAGE_CACHE_SIZE-1);
- if (page->index == end_index && offset)
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ off = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index == end_index && off)
+ zero_user_segment(page, off, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
@@ -2618,9 +2688,18 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
{
- return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
-}
+ struct btrfs_io_bio *btrfs_bio;
+ struct bio *new;
+ new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+ if (new) {
+ btrfs_bio = btrfs_io_bio(new);
+ btrfs_bio->csum = NULL;
+ btrfs_bio->csum_allocated = NULL;
+ btrfs_bio->end_io = NULL;
+ }
+ return new;
+}
/* this also allocates from the btrfs_bioset */
struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
@@ -3501,7 +3580,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
if (!trylock_page(p)) {
if (!flush) {
@@ -3522,6 +3601,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
+static void set_btree_ioerr(struct page *page)
+{
+ struct extent_buffer *eb = (struct extent_buffer *)page->private;
+ struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
+
+ SetPageError(page);
+ if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+ return;
+
+ /*
+ * If writeback for a btree extent that doesn't belong to a log tree
+ * failed, increment the counter transaction->eb_write_errors.
+ * We do this because while the transaction is running and before it's
+ * committing (when we call filemap_fdata[write|wait]_range against
+ * the btree inode), we might have
+ * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
+ * returns an error or an error happens during writeback, when we're
+ * committing the transaction we wouldn't know about it, since the pages
+ * can be no longer dirty nor marked anymore for writeback (if a
+ * subsequent modification to the extent buffer didn't happen before the
+ * transaction commit), which makes filemap_fdata[write|wait]_range not
+ * able to find the pages tagged with SetPageError at transaction
+ * commit time. So if this happens we must abort the transaction,
+ * otherwise we commit a super block with btree roots that point to
+ * btree nodes/leafs whose content on disk is invalid - either garbage
+ * or the content of some node/leaf from a past generation that got
+ * cowed or deleted and is no longer valid.
+ *
+ * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
+ * not be enough - we need to distinguish between log tree extents vs
+ * non-log tree extents, and the next filemap_fdatawait_range() call
+ * will catch and clear such errors in the mapping - and that call might
+ * be from a log sync and not from a transaction commit. Also, checking
+ * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
+ * not done and would not be reliable - the eb might have been released
+ * from memory and reading it back again means that flag would not be
+ * set (since it's a runtime flag, not persisted on disk).
+ *
+ * Using the flags below in the btree inode also makes us achieve the
+ * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
+ * writeback for all dirty pages and before filemap_fdatawait_range()
+ * is called, the writeback for all dirty pages had already finished
+ * with errors - because we were not using AS_EIO/AS_ENOSPC,
+ * filemap_fdatawait_range() would return success, as it could not know
+ * that writeback errors happened (the pages were no longer tagged for
+ * writeback).
+ */
+ switch (eb->log_index) {
+ case -1:
+ set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
+ break;
+ case 0:
+ set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+ break;
+ case 1:
+ set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+ break;
+ default:
+ BUG(); /* unexpected, logic error */
+ }
+}
+
static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
{
struct bio_vec *bvec;
@@ -3535,10 +3676,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
BUG_ON(!eb);
done = atomic_dec_and_test(&eb->io_pages);
- if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
- set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
- SetPageError(page);
+ set_btree_ioerr(page);
}
end_page_writeback(page);
@@ -3565,14 +3705,14 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
int ret = 0;
- clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
atomic_set(&eb->io_pages, num_pages);
if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
bio_flags = EXTENT_BIO_TREE_LOG;
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
clear_page_dirty_for_io(p);
set_page_writeback(p);
@@ -3582,8 +3722,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
0, epd->bio_flags, bio_flags);
epd->bio_flags = bio_flags;
if (ret) {
- set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
- SetPageError(p);
+ set_btree_ioerr(p);
+ end_page_writeback(p);
if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
end_extent_buffer_writeback(eb);
ret = -EIO;
@@ -3596,7 +3736,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
if (unlikely(ret)) {
for (; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
+ clear_page_dirty_for_io(p);
unlock_page(p);
}
}
@@ -4166,19 +4307,6 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
return NULL;
}
-static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
-{
- unsigned long cnt = *((unsigned long *)ctx);
-
- cnt++;
- *((unsigned long *)ctx) = cnt;
-
- /* Now we're sure that the extent is shared. */
- if (cnt > 1)
- return 1;
- return 0;
-}
-
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent)
{
@@ -4195,6 +4323,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
int end = 0;
u64 em_start = 0;
u64 em_len = 0;
@@ -4215,8 +4344,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
- ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
- path, btrfs_ino(inode), -1, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+ 0);
if (ret < 0) {
btrfs_free_path(path);
return ret;
@@ -4224,7 +4353,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
WARN_ON(!ret);
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
- found_type = btrfs_key_type(&found_key);
+ found_type = found_key.type;
/* No extents, but there might be delalloc bits */
if (found_key.objectid != btrfs_ino(inode) ||
@@ -4309,25 +4438,27 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
} else if (em->block_start == EXTENT_MAP_DELALLOC) {
flags |= (FIEMAP_EXTENT_DELALLOC |
FIEMAP_EXTENT_UNKNOWN);
- } else {
- unsigned long ref_cnt = 0;
+ } else if (fieinfo->fi_extents_max) {
+ u64 bytenr = em->block_start -
+ (em->start - em->orig_start);
disko = em->block_start + offset_in_extent;
/*
* As btrfs supports shared space, this information
* can be exported to userspace tools via
- * flag FIEMAP_EXTENT_SHARED.
+ * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
+ * then we're just getting a count and we can skip the
+ * lookup stuff.
*/
- ret = iterate_inodes_from_logical(
- em->block_start,
- BTRFS_I(inode)->root->fs_info,
- path, count_ext_ref, &ref_cnt);
- if (ret < 0 && ret != -ENOENT)
+ ret = btrfs_check_shared(NULL, root->fs_info,
+ root->objectid,
+ btrfs_ino(inode), bytenr);
+ if (ret < 0)
goto out_free;
-
- if (ref_cnt > 1)
+ if (ret)
flags |= FIEMAP_EXTENT_SHARED;
+ ret = 0;
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
@@ -4381,24 +4512,21 @@ int extent_buffer_under_io(struct extent_buffer *eb)
/*
* Helper for releasing extent buffer page.
*/
-static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
- unsigned long start_idx)
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
{
unsigned long index;
- unsigned long num_pages;
struct page *page;
int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
BUG_ON(extent_buffer_under_io(eb));
- num_pages = num_extent_pages(eb->start, eb->len);
- index = start_idx + num_pages;
- if (start_idx >= index)
+ index = num_extent_pages(eb->start, eb->len);
+ if (index == 0)
return;
do {
index--;
- page = extent_buffer_page(eb, index);
+ page = eb->pages[index];
if (page && mapped) {
spin_lock(&page->mapping->private_lock);
/*
@@ -4429,7 +4557,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
/* One for when we alloced the page */
page_cache_release(page);
}
- } while (index != start_idx);
+ } while (index != 0);
}
/*
@@ -4437,7 +4565,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
*/
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
- btrfs_release_extent_buffer_page(eb, 0);
+ btrfs_release_extent_buffer_page(eb);
__free_extent_buffer(eb);
}
@@ -4580,7 +4708,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
+
if (p != accessed)
mark_page_accessed(p);
}
@@ -4749,7 +4878,7 @@ again:
*/
SetPageChecked(eb->pages[0]);
for (i = 1; i < num_pages; i++) {
- p = extent_buffer_page(eb, i);
+ p = eb->pages[i];
ClearPageChecked(p);
unlock_page(p);
}
@@ -4794,7 +4923,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
}
/* Should be safe to release our pages at this point */
- btrfs_release_extent_buffer_page(eb, 0);
+ btrfs_release_extent_buffer_page(eb);
call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
return 1;
}
@@ -4860,7 +4989,7 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (!PageDirty(page))
continue;
@@ -4896,7 +5025,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
for (i = 0; i < num_pages; i++)
- set_page_dirty(extent_buffer_page(eb, i));
+ set_page_dirty(eb->pages[i]);
return was_dirty;
}
@@ -4909,7 +5038,7 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (page)
ClearPageUptodate(page);
}
@@ -4925,7 +5054,7 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
SetPageUptodate(page);
}
return 0;
@@ -4965,7 +5094,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = start_i; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (wait == WAIT_NONE) {
if (!trylock_page(page))
goto unlock_exit;
@@ -4984,11 +5113,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
goto unlock_exit;
}
- clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
for (i = start_i; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (!PageUptodate(page)) {
ClearPageError(page);
err = __extent_read_full_page(tree, page,
@@ -5013,7 +5142,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
return ret;
for (i = start_i; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
wait_on_page_locked(page);
if (!PageUptodate(page))
ret = -EIO;
@@ -5024,7 +5153,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
unlock_exit:
i = start_i;
while (locked_pages > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
i++;
unlock_page(page);
locked_pages--;
@@ -5050,7 +5179,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
cur = min(len, (PAGE_CACHE_SIZE - offset));
kaddr = page_address(page);
@@ -5082,7 +5211,7 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
cur = min(len, (PAGE_CACHE_SIZE - offset));
kaddr = page_address(page);
@@ -5131,7 +5260,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
return -EINVAL;
}
- p = extent_buffer_page(eb, i);
+ p = eb->pages[i];
kaddr = page_address(p);
*map = kaddr + offset;
*map_len = PAGE_CACHE_SIZE - offset;
@@ -5157,7 +5286,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
cur = min(len, (PAGE_CACHE_SIZE - offset));
@@ -5191,7 +5320,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5221,7 +5350,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5252,7 +5381,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
(PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(dst, i);
+ page = dst->pages[i];
WARN_ON(!PageUptodate(page));
cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
@@ -5330,8 +5459,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
cur = min_t(unsigned long, cur,
(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
- copy_pages(extent_buffer_page(dst, dst_i),
- extent_buffer_page(dst, src_i),
+ copy_pages(dst->pages[dst_i], dst->pages[src_i],
dst_off_in_page, src_off_in_page, cur);
src_offset += cur;
@@ -5377,8 +5505,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
- copy_pages(extent_buffer_page(dst, dst_i),
- extent_buffer_page(dst, src_i),
+ copy_pages(dst->pages[dst_i], dst->pages[src_i],
dst_off_in_page - cur + 1,
src_off_in_page - cur + 1, cur);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ccc264e7bde1..6d4b938be986 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -11,8 +11,6 @@
#define EXTENT_NEW (1 << 4)
#define EXTENT_DELALLOC (1 << 5)
#define EXTENT_DEFRAG (1 << 6)
-#define EXTENT_DEFRAG_DONE (1 << 7)
-#define EXTENT_BUFFER_FILLED (1 << 8)
#define EXTENT_BOUNDARY (1 << 9)
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
@@ -34,16 +32,16 @@
/* these are bit numbers for test/set bit */
#define EXTENT_BUFFER_UPTODATE 0
-#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
#define EXTENT_BUFFER_CORRUPT 3
#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
#define EXTENT_BUFFER_TREE_REF 5
#define EXTENT_BUFFER_STALE 6
#define EXTENT_BUFFER_WRITEBACK 7
-#define EXTENT_BUFFER_IOERR 8
+#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */
#define EXTENT_BUFFER_DUMMY 9
#define EXTENT_BUFFER_IN_TREE 10
+#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */
/* these are flags for extent_clear_unlock_delalloc */
#define PAGE_UNLOCK (1 << 0)
@@ -57,7 +55,6 @@
* map has page->private set to one.
*/
#define EXTENT_PAGE_PRIVATE 1
-#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
struct extent_state;
struct btrfs_root;
@@ -108,7 +105,6 @@ struct extent_state {
struct rb_node rb_node;
/* ADD NEW ELEMENTS AFTER THIS */
- struct extent_io_tree *tree;
wait_queue_head_t wq;
atomic_t refs;
unsigned long state;
@@ -126,8 +122,6 @@ struct extent_state {
struct extent_buffer {
u64 start;
unsigned long len;
- unsigned long map_start;
- unsigned long map_len;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
spinlock_t refs_lock;
@@ -144,7 +138,9 @@ struct extent_buffer {
atomic_t blocking_readers;
atomic_t spinning_readers;
atomic_t spinning_writers;
- int lock_nested;
+ short lock_nested;
+ /* >= 0 if eb belongs to a log tree, -1 otherwise */
+ short log_index;
/* protects write locks */
rwlock_t lock;
@@ -286,12 +282,6 @@ static inline unsigned long num_extent_pages(u64 start, u64 len)
(start >> PAGE_CACHE_SHIFT);
}
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
- unsigned long i)
-{
- return eb->pages[i];
-}
-
static inline void extent_buffer_get(struct extent_buffer *eb)
{
atomic_inc(&eb->refs);
@@ -341,18 +331,50 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
struct btrfs_fs_info;
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
- u64 length, u64 logical, struct page *page,
- int mirror_num);
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+ struct page *page, unsigned int pg_offset,
+ int mirror_num);
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+ unsigned int pg_offset);
int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num);
+
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+ struct page *page;
+ u64 start;
+ u64 len;
+ u64 logical;
+ unsigned long bio_flags;
+ int this_mirror;
+ int failed_mirror;
+ int in_validation;
+};
+
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end);
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+ struct io_failure_record **failrec_ret);
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec, int fail_mirror);
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ struct page *page, int pg_offset, int icsum,
+ bio_end_io_t *endio_func, void *data);
+int free_io_failure(struct inode *inode, struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
noinline u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes);
+#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
#endif
-#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54c84daec9b5..783a94355efd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -55,7 +55,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
file_key.objectid = objectid;
file_key.offset = pos;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+ file_key.type = BTRFS_EXTENT_DATA_KEY;
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
@@ -100,7 +100,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
file_key.offset = bytenr;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+ file_key.type = BTRFS_EXTENT_CSUM_KEY;
ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
if (ret < 0)
goto fail;
@@ -111,7 +111,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
goto fail;
path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
+ if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
goto fail;
csum_offset = (bytenr - found_key.offset) >>
@@ -148,7 +148,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
file_key.objectid = objectid;
file_key.offset = offset;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+ file_key.type = BTRFS_EXTENT_DATA_KEY;
ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
return ret;
}
@@ -299,19 +299,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
}
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct btrfs_dio_private *dip, struct bio *bio,
- u64 offset)
+ struct bio *bio, u64 offset)
{
- int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr;
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- int ret;
-
- len >>= inode->i_sb->s_blocksize_bits;
- len *= csum_size;
-
- ret = __btrfs_lookup_bio_sums(root, inode, bio, offset,
- (u32 *)(dip->csum + len), 1);
- return ret;
+ return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -329,8 +319,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
u64 csum_end;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- ASSERT(start == ALIGN(start, root->sectorsize) &&
- (end + 1) == ALIGN(end + 1, root->sectorsize));
+ ASSERT(IS_ALIGNED(start, root->sectorsize) &&
+ IS_ALIGNED(end + 1, root->sectorsize));
path = btrfs_alloc_path();
if (!path)
@@ -720,7 +710,7 @@ again:
bytenr = sums->bytenr + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
file_key.offset = bytenr;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+ file_key.type = BTRFS_EXTENT_CSUM_KEY;
item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
@@ -790,7 +780,7 @@ again:
csum_offset = (bytenr - found_key.offset) >>
root->fs_info->sb->s_blocksize_bits;
- if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
+ if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
goto insert;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ff1cc0399b9a..a18ceabd99a8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
/* get the inode */
key.objectid = defrag->root;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
}
key.objectid = defrag->ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
if (IS_ERR(inode)) {
@@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
if (unlikely(copied == 0))
break;
- if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+ if (copied < PAGE_CACHE_SIZE - offset) {
offset += copied;
} else {
pg++;
@@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
bool force_page_uptodate = false;
bool need_unlock;
- nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
- (sizeof(struct page *)));
+ nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
+ PAGE_CACHE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
@@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_CACHE_SIZE -
offset);
- size_t num_pages = (write_bytes + offset +
- PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
+ PAGE_CACHE_SIZE);
size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
@@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
* our prealloc extent may be smaller than
* write_bytes, so scale down.
*/
- num_pages = (write_bytes + offset +
- PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ num_pages = DIV_ROUND_UP(write_bytes + offset,
+ PAGE_CACHE_SIZE);
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
ret = 0;
} else {
@@ -1590,9 +1588,8 @@ again:
dirty_pages = 0;
} else {
force_page_uptodate = false;
- dirty_pages = (copied + offset +
- PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ dirty_pages = DIV_ROUND_UP(copied + offset,
+ PAGE_CACHE_SIZE);
}
/*
@@ -1653,7 +1650,7 @@ again:
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
- if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+ if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
btrfs_btree_balance_dirty(root);
pos += copied;
@@ -1795,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
if (sync)
atomic_inc(&BTRFS_I(inode)->sync_writers);
- if (unlikely(file->f_flags & O_DIRECT)) {
+ if (file->f_flags & O_DIRECT) {
num_written = __btrfs_direct_write(iocb, from, pos);
} else {
num_written = __btrfs_buffered_write(file, from, pos);
@@ -1852,6 +1849,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
return 0;
}
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+ int ret;
+
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
+
+ return ret;
+}
+
/*
* fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible.
@@ -1881,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* multi-task, and make the performance up. See
* btrfs_wait_ordered_range for an explanation of the ASYNC check.
*/
- atomic_inc(&BTRFS_I(inode)->sync_writers);
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- atomic_dec(&BTRFS_I(inode)->sync_writers);
+ ret = start_ordered_ops(inode, start, end);
if (ret)
return ret;
mutex_lock(&inode->i_mutex);
-
- /*
- * We flush the dirty pages again to avoid some dirty pages in the
- * range being left.
- */
atomic_inc(&root->log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * We might have have had more pages made dirty after calling
+ * start_ordered_ops and before acquiring the inode's i_mutex.
+ */
if (full_sync) {
+ /*
+ * For a full sync, we need to make sure any ordered operations
+ * start and finish before we start logging the inode, so that
+ * all extents are persisted and the respective file extent
+ * items are in the fs/subvol btree.
+ */
ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
+ } else {
+ /*
+ * Start any new ordered operations before starting to log the
+ * inode. We will wait for them to finish in btrfs_sync_log().
+ *
+ * Right before acquiring the inode's mutex, we might have new
+ * writes dirtying pages, which won't immediately start the
+ * respective ordered operations - that is done through the
+ * fill_delalloc callbacks invoked from the writepage and
+ * writepages address space operations. So make sure we start
+ * all ordered operations before starting to log our inode. Not
+ * doing this means that while logging the inode, writeback
+ * could start and invoke writepage/writepages, which would call
+ * the fill_delalloc callbacks (cow_file_range,
+ * submit_compressed_extents). These callbacks add first an
+ * extent map to the modified list of extents and then create
+ * the respective ordered operation, which means in
+ * tree-log.c:btrfs_log_inode() we might capture all existing
+ * ordered operations (with btrfs_get_logged_extents()) before
+ * the fill_delalloc callback adds its ordered operation, and by
+ * the time we visit the modified list of extent maps (with
+ * btrfs_log_changed_extents()), we see and process the extent
+ * map they created. We then use the extent map to construct a
+ * file extent item for logging without waiting for the
+ * respective ordered operation to finish - this file extent
+ * item points to a disk location that might not have yet been
+ * written to, containing random data - so after a crash a log
+ * replay will make our inode have file extent items that point
+ * to disk locations containing invalid data, as we returned
+ * success to userspace without waiting for the respective
+ * ordered operation to finish, because it wasn't captured by
+ * btrfs_get_logged_extents().
+ */
+ ret = start_ordered_ops(inode, start, end);
+ }
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
}
atomic_inc(&root->log_batch);
@@ -1984,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
mutex_unlock(&inode->i_mutex);
+ /*
+ * If any of the ordered extents had an error, just return it to user
+ * space, so that the application knows some writes didn't succeed and
+ * can take proper action (retry for e.g.). Blindly committing the
+ * transaction in this case, would fool userspace that everything was
+ * successful. And we also want to make sure our log doesn't contain
+ * file extent items pointing to extents that weren't fully written to -
+ * just like in the non fast fsync path, where we check for the ordered
+ * operation's error flag before writing to the log tree and return -EIO
+ * if any of them had this flag set (btrfs_wait_ordered_range) -
+ * therefore we need to check for errors in the ordered operations,
+ * which are indicated by ctx.io_err.
+ */
+ if (ctx.io_err) {
+ btrfs_end_transaction(trans, root);
+ ret = ctx.io_err;
+ goto out;
+ }
+
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
ret = btrfs_sync_log(trans, root, &ctx);
@@ -2621,23 +2685,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- u64 lockstart = *offset;
- u64 lockend = i_size_read(inode);
- u64 start = *offset;
- u64 len = i_size_read(inode);
+ u64 lockstart;
+ u64 lockend;
+ u64 start;
+ u64 len;
int ret = 0;
- lockend = max_t(u64, root->sectorsize, lockend);
+ if (inode->i_size == 0)
+ return -ENXIO;
+
+ /*
+ * *offset can be negative, in this case we start finding DATA/HOLE from
+ * the very start of the file.
+ */
+ start = max_t(loff_t, 0, *offset);
+
+ lockstart = round_down(start, root->sectorsize);
+ lockend = round_up(i_size_read(inode), root->sectorsize);
if (lockend <= lockstart)
lockend = lockstart + root->sectorsize;
-
lockend--;
len = lockend - lockstart + 1;
- len = max_t(u64, len, root->sectorsize);
- if (inode->i_size == 0)
- return -ENXIO;
-
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
&cached_state);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2b0a627cb5f9..33848196550e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -279,8 +279,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
int num_pages;
int check_crcs = 0;
- num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
check_crcs = 1;
@@ -1998,6 +1997,128 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
return merged;
}
+static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ struct btrfs_free_space *bitmap;
+ unsigned long i;
+ unsigned long j;
+ const u64 end = info->offset + info->bytes;
+ const u64 bitmap_offset = offset_to_bitmap(ctl, end);
+ u64 bytes;
+
+ bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (!bitmap)
+ return false;
+
+ i = offset_to_bit(bitmap->offset, ctl->unit, end);
+ j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
+ if (j == i)
+ return false;
+ bytes = (j - i) * ctl->unit;
+ info->bytes += bytes;
+
+ if (update_stat)
+ bitmap_clear_bits(ctl, bitmap, end, bytes);
+ else
+ __bitmap_clear_bits(ctl, bitmap, end, bytes);
+
+ if (!bitmap->bytes)
+ free_bitmap(ctl, bitmap);
+
+ return true;
+}
+
+static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ struct btrfs_free_space *bitmap;
+ u64 bitmap_offset;
+ unsigned long i;
+ unsigned long j;
+ unsigned long prev_j;
+ u64 bytes;
+
+ bitmap_offset = offset_to_bitmap(ctl, info->offset);
+ /* If we're on a boundary, try the previous logical bitmap. */
+ if (bitmap_offset == info->offset) {
+ if (info->offset == 0)
+ return false;
+ bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
+ }
+
+ bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (!bitmap)
+ return false;
+
+ i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
+ j = 0;
+ prev_j = (unsigned long)-1;
+ for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
+ if (j > i)
+ break;
+ prev_j = j;
+ }
+ if (prev_j == i)
+ return false;
+
+ if (prev_j == (unsigned long)-1)
+ bytes = (i + 1) * ctl->unit;
+ else
+ bytes = (i - prev_j) * ctl->unit;
+
+ info->offset -= bytes;
+ info->bytes += bytes;
+
+ if (update_stat)
+ bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+ else
+ __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+
+ if (!bitmap->bytes)
+ free_bitmap(ctl, bitmap);
+
+ return true;
+}
+
+/*
+ * We prefer always to allocate from extent entries, both for clustered and
+ * non-clustered allocation requests. So when attempting to add a new extent
+ * entry, try to see if there's adjacent free space in bitmap entries, and if
+ * there is, migrate that space from the bitmaps to the extent.
+ * Like this we get better chances of satisfying space allocation requests
+ * because we attempt to satisfy them based on a single cache entry, and never
+ * on 2 or more entries - even if the entries represent a contiguous free space
+ * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
+ * ends).
+ */
+static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ /*
+ * Only work with disconnected entries, as we can change their offset,
+ * and must be extent entries.
+ */
+ ASSERT(!info->bitmap);
+ ASSERT(RB_EMPTY_NODE(&info->offset_index));
+
+ if (ctl->total_bitmaps > 0) {
+ bool stole_end;
+ bool stole_front = false;
+
+ stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
+ if (ctl->total_bitmaps > 0)
+ stole_front = steal_from_bitmap_to_front(ctl, info,
+ update_stat);
+
+ if (stole_end || stole_front)
+ try_merge_free_space(ctl, info, update_stat);
+ }
+}
+
int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
u64 offset, u64 bytes)
{
@@ -2010,6 +2131,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
info->offset = offset;
info->bytes = bytes;
+ RB_CLEAR_NODE(&info->offset_index);
spin_lock(&ctl->tree_lock);
@@ -2029,6 +2151,14 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
goto out;
}
link:
+ /*
+ * Only steal free space from adjacent bitmaps if we're sure we're not
+ * going to add the new free space to existing bitmap entries - because
+ * that would mean unnecessary work that would be reverted. Therefore
+ * attempt to steal space from bitmaps if we're adding an extent entry.
+ */
+ steal_from_bitmap(ctl, info, true);
+
ret = link_free_space(ctl, info);
if (ret)
kmem_cache_free(btrfs_free_space_cachep, info);
@@ -2205,10 +2335,13 @@ __btrfs_return_cluster_to_free_space(
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
rb_erase(&entry->offset_index, &cluster->root);
+ RB_CLEAR_NODE(&entry->offset_index);
bitmap = (entry->bitmap != NULL);
- if (!bitmap)
+ if (!bitmap) {
try_merge_free_space(ctl, entry, false);
+ steal_from_bitmap(ctl, entry, false);
+ }
tree_insert_offset(&ctl->free_space_offset,
entry->offset, &entry->offset_index, bitmap);
}
@@ -3033,10 +3166,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
{
struct inode *inode = NULL;
- spin_lock(&root->cache_lock);
- if (root->cache_inode)
- inode = igrab(root->cache_inode);
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_inode)
+ inode = igrab(root->ino_cache_inode);
+ spin_unlock(&root->ino_cache_lock);
if (inode)
return inode;
@@ -3044,10 +3177,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
if (IS_ERR(inode))
return inode;
- spin_lock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
if (!btrfs_fs_closing(root->fs_info))
- root->cache_inode = igrab(inode);
- spin_unlock(&root->cache_lock);
+ root->ino_cache_inode = igrab(inode);
+ spin_unlock(&root->ino_cache_lock);
return inode;
}
@@ -3176,6 +3309,7 @@ again:
map = NULL;
add_new_bitmap(ctl, info, offset);
bitmap_info = info;
+ info = NULL;
}
bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
@@ -3186,6 +3320,8 @@ again:
if (bytes)
goto again;
+ if (info)
+ kmem_cache_free(btrfs_free_space_cachep, info);
if (map)
kfree(map);
return 0;
@@ -3260,6 +3396,7 @@ have_info:
goto have_info;
}
+ ret = 0;
goto out;
}
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 85889aa82c62..64f15bb30a81 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -20,10 +20,8 @@ static struct crypto_shash *tfm;
int __init btrfs_hash_init(void)
{
tfm = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
- return 0;
+ return PTR_ERR_OR_ZERO(tfm);
}
void btrfs_hash_exit(void)
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 2be38df703c9..8ffa4783cbf4 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -135,7 +135,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
u32 item_size;
key.objectid = inode_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
path = btrfs_alloc_path();
@@ -209,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.offset = ref_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+ key.type = BTRFS_INODE_REF_KEY;
path = btrfs_alloc_path();
if (!path)
@@ -337,7 +337,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.offset = ref_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+ key.type = BTRFS_INODE_REF_KEY;
path = btrfs_alloc_path();
if (!path)
@@ -400,7 +400,7 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -420,13 +420,13 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
struct btrfs_key found_key;
ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
- if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
+ if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
location->offset == (u64)-1 && path->slots[0] != 0) {
slot = path->slots[0] - 1;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid == location->objectid &&
- btrfs_key_type(&found_key) == btrfs_key_type(location)) {
+ found_key.type == location->type) {
path->slots[0]--;
return 0;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 888fbe19079f..83d646bd2e4b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -87,7 +87,7 @@ again:
*/
btrfs_item_key_to_cpu(leaf, &key, 0);
btrfs_release_path(path);
- root->cache_progress = last;
+ root->ino_cache_progress = last;
up_read(&fs_info->commit_root_sem);
schedule_timeout(1);
goto again;
@@ -106,7 +106,7 @@ again:
if (last != (u64)-1 && last + 1 != key.objectid) {
__btrfs_add_free_space(ctl, last + 1,
key.objectid - last - 1);
- wake_up(&root->cache_wait);
+ wake_up(&root->ino_cache_wait);
}
last = key.objectid;
@@ -119,14 +119,14 @@ next:
root->highest_objectid - last - 1);
}
- spin_lock(&root->cache_lock);
- root->cached = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->ino_cache_lock);
- root->cache_progress = (u64)-1;
+ root->ino_cache_progress = (u64)-1;
btrfs_unpin_free_ino(root);
out:
- wake_up(&root->cache_wait);
+ wake_up(&root->ino_cache_wait);
up_read(&fs_info->commit_root_sem);
btrfs_free_path(path);
@@ -144,20 +144,20 @@ static void start_caching(struct btrfs_root *root)
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
- spin_lock(&root->cache_lock);
- if (root->cached != BTRFS_CACHE_NO) {
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state != BTRFS_CACHE_NO) {
+ spin_unlock(&root->ino_cache_lock);
return;
}
- root->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&root->cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_STARTED;
+ spin_unlock(&root->ino_cache_lock);
ret = load_free_ino_cache(root->fs_info, root);
if (ret == 1) {
- spin_lock(&root->cache_lock);
- root->cached = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->ino_cache_lock);
return;
}
@@ -196,11 +196,11 @@ again:
start_caching(root);
- wait_event(root->cache_wait,
- root->cached == BTRFS_CACHE_FINISHED ||
+ wait_event(root->ino_cache_wait,
+ root->ino_cache_state == BTRFS_CACHE_FINISHED ||
root->free_ino_ctl->free_space > 0);
- if (root->cached == BTRFS_CACHE_FINISHED &&
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
root->free_ino_ctl->free_space == 0)
return -ENOSPC;
else
@@ -214,17 +214,17 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
again:
- if (root->cached == BTRFS_CACHE_FINISHED) {
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
__btrfs_add_free_space(pinned, objectid, 1);
} else {
down_write(&root->fs_info->commit_root_sem);
- spin_lock(&root->cache_lock);
- if (root->cached == BTRFS_CACHE_FINISHED) {
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
+ spin_unlock(&root->ino_cache_lock);
up_write(&root->fs_info->commit_root_sem);
goto again;
}
- spin_unlock(&root->cache_lock);
+ spin_unlock(&root->ino_cache_lock);
start_caching(root);
@@ -235,10 +235,10 @@ again:
}
/*
- * When a transaction is committed, we'll move those inode numbers which
- * are smaller than root->cache_progress from pinned tree to free_ino tree,
- * and others will just be dropped, because the commit root we were
- * searching has changed.
+ * When a transaction is committed, we'll move those inode numbers which are
+ * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
+ * others will just be dropped, because the commit root we were searching has
+ * changed.
*
* Must be called with root->fs_info->commit_root_sem held
*/
@@ -261,10 +261,10 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
info = rb_entry(n, struct btrfs_free_space, offset_index);
BUG_ON(info->bitmap); /* Logic error */
- if (info->offset > root->cache_progress)
+ if (info->offset > root->ino_cache_progress)
goto free;
- else if (info->offset + info->bytes > root->cache_progress)
- count = root->cache_progress - info->offset + 1;
+ else if (info->offset + info->bytes > root->ino_cache_progress)
+ count = root->ino_cache_progress - info->offset + 1;
else
count = info->bytes;
@@ -462,13 +462,13 @@ again:
}
}
- spin_lock(&root->cache_lock);
- if (root->cached != BTRFS_CACHE_FINISHED) {
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
ret = -1;
- spin_unlock(&root->cache_lock);
+ spin_unlock(&root->ino_cache_lock);
goto out_put;
}
- spin_unlock(&root->cache_lock);
+ spin_unlock(&root->ino_cache_lock);
spin_lock(&ctl->tree_lock);
prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 016c403bfe7e..fc9c0439caa3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
key.objectid = btrfs_ino(inode);
key.offset = start;
- btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+ key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
path->leave_spinning = 1;
@@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
data_len = compressed_size;
if (start > 0 ||
- actual_end >= PAGE_CACHE_SIZE ||
- data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+ actual_end > PAGE_CACHE_SIZE ||
+ data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
(!compressed_size &&
(actual_end & (root->sectorsize - 1)) == 0) ||
end + 1 < isize ||
@@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow,
return 0;
}
+static inline int inode_need_compress(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /* force compress */
+ if (btrfs_test_opt(root, FORCE_COMPRESS))
+ return 1;
+ /* bad compression ratios */
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ return 0;
+ if (btrfs_test_opt(root, COMPRESS) ||
+ BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
+ BTRFS_I(inode)->force_compress)
+ return 1;
+ return 0;
+}
+
/*
* we create compressed extents in two phases. The first
* phase compresses a range of pages that have already been
@@ -444,10 +461,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
- (btrfs_test_opt(root, COMPRESS) ||
- (BTRFS_I(inode)->force_compress) ||
- (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
+ if (inode_need_compress(inode)) {
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
if (!pages) {
@@ -1094,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->locked_page = locked_page;
async_cow->start = start;
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+ !btrfs_test_opt(root, FORCE_COMPRESS))
cur_end = end;
else
cur_end = min(end, start + 512 * 1024 - 1);
@@ -1445,6 +1460,26 @@ error:
return ret;
}
+static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+{
+
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+ return 0;
+
+ /*
+ * @defrag_bytes is a hint value, no spinlock held here,
+ * if is not zero, it means the file is defragging.
+ * Force cow if given extent needs to be defragged.
+ */
+ if (BTRFS_I(inode)->defrag_bytes &&
+ test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+ EXTENT_DEFRAG, 0, NULL))
+ return 1;
+
+ return 0;
+}
+
/*
* extent_io.c call back to do delayed allocation processing
*/
@@ -1453,17 +1488,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
unsigned long *nr_written)
{
int ret;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ int force_cow = need_force_cow(inode, start, end);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
- } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
+ } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
- } else if (!btrfs_test_opt(root, COMPRESS) &&
- !(BTRFS_I(inode)->force_compress) &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
+ } else if (!inode_need_compress(inode)) {
ret = cow_file_range(inode, locked_page, start, end,
page_started, nr_written, 1);
} else {
@@ -1555,6 +1588,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
struct extent_state *state, unsigned long *bits)
{
+ if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+ WARN_ON(1);
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
@@ -1577,6 +1612,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
root->fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
+ if (*bits & EXTENT_DEFRAG)
+ BTRFS_I(inode)->defrag_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&BTRFS_I(inode)->runtime_flags))
btrfs_add_delalloc_inodes(root, inode);
@@ -1591,6 +1628,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
struct extent_state *state,
unsigned long *bits)
{
+ u64 len = state->end + 1 - state->start;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
+ BTRFS_I(inode)->defrag_bytes -= len;
+ spin_unlock(&BTRFS_I(inode)->lock);
+
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
@@ -1598,7 +1642,6 @@ static void btrfs_clear_bit_hook(struct inode *inode,
*/
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 len = state->end + 1 - state->start;
bool do_list = !btrfs_is_free_space_inode(inode);
if (*bits & EXTENT_FIRST_DELALLOC) {
@@ -2660,6 +2703,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1);
+
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
logical_len = ordered_extent->truncated_len;
@@ -2856,6 +2903,40 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
return 0;
}
+static int __readpage_endio_check(struct inode *inode,
+ struct btrfs_io_bio *io_bio,
+ int icsum, struct page *page,
+ int pgoff, u64 start, size_t len)
+{
+ char *kaddr;
+ u32 csum_expected;
+ u32 csum = ~(u32)0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ csum_expected = *(((u32 *)io_bio->csum) + icsum);
+
+ kaddr = kmap_atomic(page);
+ csum = btrfs_csum_data(kaddr + pgoff, csum, len);
+ btrfs_csum_final(csum, (char *)&csum);
+ if (csum != csum_expected)
+ goto zeroit;
+
+ kunmap_atomic(kaddr);
+ return 0;
+zeroit:
+ if (__ratelimit(&_rs))
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "csum failed ino %llu off %llu csum %u expected csum %u",
+ btrfs_ino(inode), start, csum, csum_expected);
+ memset(kaddr + pgoff, 1, len);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ if (csum_expected == 0)
+ return 0;
+ return -EIO;
+}
+
/*
* when reads are done, we need to check csums to verify the data is correct
* if there's a match, we allow the bio to finish. If not, the code in
@@ -2868,20 +2949,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- char *kaddr;
struct btrfs_root *root = BTRFS_I(inode)->root;
- u32 csum_expected;
- u32 csum = ~(u32)0;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
if (PageChecked(page)) {
ClearPageChecked(page);
- goto good;
+ return 0;
}
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
- goto good;
+ return 0;
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2891,28 +2967,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
}
phy_offset >>= inode->i_sb->s_blocksize_bits;
- csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
-
- kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
- btrfs_csum_final(csum, (char *)&csum);
- if (csum != csum_expected)
- goto zeroit;
-
- kunmap_atomic(kaddr);
-good:
- return 0;
-
-zeroit:
- if (__ratelimit(&_rs))
- btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
- btrfs_ino(page->mapping->host), start, csum, csum_expected);
- memset(kaddr + offset, 1, end - start + 1);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- if (csum_expected == 0)
- return 0;
- return -EIO;
+ return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
+ start, (size_t)(end - start + 1));
}
struct delayed_iput {
@@ -3159,7 +3215,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
path->reada = -1;
key.objectid = BTRFS_ORPHAN_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
@@ -3186,7 +3242,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* make sure the item matches what we want */
if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
break;
- if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+ if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
/* release the path since we're done with it */
@@ -3662,7 +3718,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* without delay
*/
if (!btrfs_is_free_space_inode(inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && !root->fs_info->log_root_recovering) {
btrfs_update_root_times(trans, root);
ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -4085,7 +4142,7 @@ search_again:
fi = NULL;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = btrfs_key_type(&found_key);
+ found_type = found_key.type;
if (found_key.objectid != ino)
break;
@@ -4747,6 +4804,8 @@ void btrfs_evict_inode(struct inode *inode)
/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ btrfs_free_io_failure_record(inode, 0, (u64)-1);
+
if (root->fs_info->log_root_recovering) {
BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags));
@@ -5331,7 +5390,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
btrfs_get_delayed_items(inode, &ins_list, &del_list);
}
- btrfs_set_key_type(&key, key_type);
+ key.type = key_type;
key.offset = ctx->pos;
key.objectid = btrfs_ino(inode);
@@ -5356,7 +5415,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (found_key.objectid != key.objectid)
break;
- if (btrfs_key_type(&found_key) != key_type)
+ if (found_key.type != key_type)
break;
if (found_key.offset < ctx->pos)
goto next;
@@ -5568,7 +5627,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
int ret;
key.objectid = btrfs_ino(inode);
- btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.type = BTRFS_DIR_INDEX_KEY;
key.offset = (u64)-1;
path = btrfs_alloc_path();
@@ -5600,7 +5659,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != btrfs_ino(inode) ||
- btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+ found_key.type != BTRFS_DIR_INDEX_KEY) {
BTRFS_I(inode)->index_cnt = 2;
goto out;
}
@@ -5718,7 +5777,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
key[0].objectid = objectid;
- btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+ key[0].type = BTRFS_INODE_ITEM_KEY;
key[0].offset = 0;
sizes[0] = sizeof(struct btrfs_inode_item);
@@ -5731,7 +5790,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* add more hard links than can fit in the ref item.
*/
key[1].objectid = objectid;
- btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+ key[1].type = BTRFS_INODE_REF_KEY;
key[1].offset = ref_objectid;
sizes[1] = name_len + sizeof(*ref);
@@ -5740,7 +5799,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
location = &BTRFS_I(inode)->location;
location->objectid = objectid;
location->offset = 0;
- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+ location->type = BTRFS_INODE_ITEM_KEY;
ret = btrfs_insert_inode_locked(inode);
if (ret < 0)
@@ -5832,7 +5891,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
} else {
key.objectid = ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
}
@@ -6191,21 +6250,60 @@ out_fail_inode:
goto out_fail;
}
+/* Find next extent map of a given extent map, caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+ struct rb_node *next;
+
+ next = rb_next(&em->rb_node);
+ if (!next)
+ return NULL;
+ return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+ struct rb_node *prev;
+
+ prev = rb_prev(&em->rb_node);
+ if (!prev)
+ return NULL;
+ return container_of(prev, struct extent_map, rb_node);
+}
+
/* helper for btfs_get_extent. Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
* and an extent that you want to insert, deal with overlap and insert
- * the new extent into the tree.
+ * the best fitted new extent into the tree.
*/
static int merge_extent_mapping(struct extent_map_tree *em_tree,
struct extent_map *existing,
struct extent_map *em,
u64 map_start)
{
+ struct extent_map *prev;
+ struct extent_map *next;
+ u64 start;
+ u64 end;
u64 start_diff;
BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
- start_diff = map_start - em->start;
- em->start = map_start;
- em->len = existing->start - em->start;
+
+ if (existing->start > map_start) {
+ next = existing;
+ prev = prev_extent_map(next);
+ } else {
+ prev = existing;
+ next = next_extent_map(prev);
+ }
+
+ start = prev ? extent_map_end(prev) : em->start;
+ start = max_t(u64, start, em->start);
+ end = next ? next->start : extent_map_end(em);
+ end = min_t(u64, end, extent_map_end(em));
+ start_diff = start - em->start;
+ em->start = start;
+ em->len = end - start;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
em->block_start += start_diff;
@@ -6333,7 +6431,7 @@ again:
struct btrfs_file_extent_item);
/* are we inside the extent that was found? */
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = btrfs_key_type(&found_key);
+ found_type = found_key.type;
if (found_key.objectid != objectid ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/*
@@ -6482,25 +6580,21 @@ insert:
ret = 0;
- existing = lookup_extent_mapping(em_tree, start, len);
- if (existing && (existing->start > start ||
- existing->start + existing->len <= start)) {
+ existing = search_extent_mapping(em_tree, start, len);
+ /*
+ * existing will always be non-NULL, since there must be
+ * extent causing the -EEXIST.
+ */
+ if (start >= extent_map_end(existing) ||
+ start <= existing->start) {
+ /*
+ * The existing extent map is the one nearest to
+ * the [start, start + len) range which overlaps
+ */
+ err = merge_extent_mapping(em_tree, existing,
+ em, start);
free_extent_map(existing);
- existing = NULL;
- }
- if (!existing) {
- existing = lookup_extent_mapping(em_tree, em->start,
- em->len);
- if (existing) {
- err = merge_extent_mapping(em_tree, existing,
- em, start);
- free_extent_map(existing);
- if (err) {
- free_extent_map(em);
- em = NULL;
- }
- } else {
- err = -EIO;
+ if (err) {
free_extent_map(em);
em = NULL;
}
@@ -7112,8 +7206,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
block_start, len,
orig_block_len,
ram_bytes, type);
- if (IS_ERR(em))
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
goto unlock_err;
+ }
}
ret = btrfs_add_ordered_extent_dio(inode, start,
@@ -7188,45 +7284,277 @@ unlock_err:
return ret;
}
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+ int rw, int mirror_num)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct bio_vec *bvec;
- struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct bio *dio_bio;
- u32 *csums = (u32 *)dip->csum;
+ int ret;
+
+ BUG_ON(rw & REQ_WRITE);
+
+ bio_get(bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DIO_REPAIR);
+ if (ret)
+ goto err;
+
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+ bio_put(bio);
+ return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+ struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ int failed_mirror)
+{
+ int num_copies;
+
+ num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+ failrec->logical, failrec->len);
+ if (num_copies == 1) {
+ /*
+ * we only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. no
+ * matter what the error is, it is very likely to persist.
+ */
+ pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ return 0;
+ }
+
+ failrec->failed_mirror = failed_mirror;
+ failrec->this_mirror++;
+ if (failrec->this_mirror == failed_mirror)
+ failrec->this_mirror++;
+
+ if (failrec->this_mirror > num_copies) {
+ pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+ struct page *page, u64 start, u64 end,
+ int failed_mirror, bio_end_io_t *repair_endio,
+ void *repair_arg)
+{
+ struct io_failure_record *failrec;
+ struct bio *bio;
+ int isector;
+ int read_mode;
+ int ret;
+
+ BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+ ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+ if (ret)
+ return ret;
+
+ ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+ failed_mirror);
+ if (!ret) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ if (failed_bio->bi_vcnt > 1)
+ read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+ else
+ read_mode = READ_SYNC;
+
+ isector = start - btrfs_io_bio(failed_bio)->logical;
+ isector >>= inode->i_sb->s_blocksize_bits;
+ bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+ 0, isector, repair_endio, repair_arg);
+ if (!bio) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ btrfs_debug(BTRFS_I(inode)->root->fs_info,
+ "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+ read_mode, failrec->this_mirror, failrec->in_validation);
+
+ ret = submit_dio_repair_bio(inode, bio, read_mode,
+ failrec->this_mirror);
+ if (ret) {
+ free_io_failure(inode, failrec);
+ bio_put(bio);
+ }
+
+ return ret;
+}
+
+struct btrfs_retry_complete {
+ struct completion done;
+ struct inode *inode;
+ u64 start;
+ int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+ struct btrfs_retry_complete *done = bio->bi_private;
+ struct bio_vec *bvec;
+ int i;
+
+ if (err)
+ goto end;
+
+ done->uptodate = 1;
+ bio_for_each_segment_all(bvec, bio, i)
+ clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+ complete(&done->done);
+ bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+ struct btrfs_io_bio *io_bio)
+{
+ struct bio_vec *bvec;
+ struct btrfs_retry_complete done;
u64 start;
int i;
+ int ret;
+
+ start = io_bio->logical;
+ done.inode = inode;
+
+ bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+ done.uptodate = 0;
+ done.start = start;
+ init_completion(&done.done);
+
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+ start + bvec->bv_len - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio_nocsum, &done);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&done.done);
+
+ if (!done.uptodate) {
+ /* We might have another mirror, so try again */
+ goto try_again;
+ }
+
+ start += bvec->bv_len;
+ }
+
+ return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+ struct btrfs_retry_complete *done = bio->bi_private;
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct bio_vec *bvec;
+ int uptodate;
+ int ret;
+ int i;
+
+ if (err)
+ goto end;
- start = dip->logical_offset;
+ uptodate = 1;
bio_for_each_segment_all(bvec, bio, i) {
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- struct page *page = bvec->bv_page;
- char *kaddr;
- u32 csum = ~(u32)0;
- unsigned long flags;
-
- local_irq_save(flags);
- kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(kaddr + bvec->bv_offset,
- csum, bvec->bv_len);
- btrfs_csum_final(csum, (char *)&csum);
- kunmap_atomic(kaddr);
- local_irq_restore(flags);
-
- flush_dcache_page(bvec->bv_page);
- if (csum != csums[i]) {
- btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
- btrfs_ino(inode), start, csum,
- csums[i]);
- err = -EIO;
- }
+ ret = __readpage_endio_check(done->inode, io_bio, i,
+ bvec->bv_page, 0,
+ done->start, bvec->bv_len);
+ if (!ret)
+ clean_io_failure(done->inode, done->start,
+ bvec->bv_page, 0);
+ else
+ uptodate = 0;
+ }
+
+ done->uptodate = uptodate;
+end:
+ complete(&done->done);
+ bio_put(bio);
+}
+
+static int __btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, int err)
+{
+ struct bio_vec *bvec;
+ struct btrfs_retry_complete done;
+ u64 start;
+ u64 offset = 0;
+ int i;
+ int ret;
+
+ err = 0;
+ start = io_bio->logical;
+ done.inode = inode;
+
+ bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+ ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+ 0, start, bvec->bv_len);
+ if (likely(!ret))
+ goto next;
+try_again:
+ done.uptodate = 0;
+ done.start = start;
+ init_completion(&done.done);
+
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+ start + bvec->bv_len - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio, &done);
+ if (ret) {
+ err = ret;
+ goto next;
}
+ wait_for_completion(&done.done);
+
+ if (!done.uptodate) {
+ /* We might have another mirror, so try again */
+ goto try_again;
+ }
+next:
+ offset += bvec->bv_len;
start += bvec->bv_len;
}
+ return err;
+}
+
+static int btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, int err)
+{
+ bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+ if (skip_csum) {
+ if (unlikely(err))
+ return __btrfs_correct_data_nocsum(inode, io_bio);
+ else
+ return 0;
+ } else {
+ return __btrfs_subio_endio_read(inode, io_bio, err);
+ }
+}
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct inode *inode = dip->inode;
+ struct bio *dio_bio;
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+
+ if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+ err = btrfs_subio_endio_read(inode, io_bio, err);
+
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
dip->logical_offset + dip->bytes - 1);
dio_bio = dip->dio_bio;
@@ -7237,6 +7565,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
if (err)
clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
dio_end_io(dio_bio, err);
+
+ if (io_bio->end_io)
+ io_bio->end_io(io_bio, err);
bio_put(bio);
}
@@ -7302,12 +7633,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
{
struct btrfs_dio_private *dip = bio->bi_private;
+ if (err)
+ btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+ "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+ btrfs_ino(dip->inode), bio->bi_rw,
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, err);
+
+ if (dip->subio_endio)
+ err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
+
if (err) {
- btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
- "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
- btrfs_ino(dip->inode), bio->bi_rw,
- (unsigned long long)bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, err);
dip->errors = 1;
/*
@@ -7338,6 +7674,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
}
+static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_dio_private *dip,
+ struct bio *bio,
+ u64 file_offset)
+{
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
+ int ret;
+
+ /*
+ * We load all the csum data we need when we submit
+ * the first bio to reduce the csum tree search and
+ * contention.
+ */
+ if (dip->logical_offset == file_offset) {
+ ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
+ file_offset);
+ if (ret)
+ return ret;
+ }
+
+ if (bio == dip->orig_bio)
+ return 0;
+
+ file_offset -= dip->logical_offset;
+ file_offset >>= inode->i_sb->s_blocksize_bits;
+ io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
+
+ return 0;
+}
+
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
int rw, u64 file_offset, int skip_sum,
int async_submit)
@@ -7353,7 +7721,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
bio_get(bio);
if (!write) {
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DATA);
if (ret)
goto err;
}
@@ -7376,13 +7745,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
if (ret)
goto err;
- } else if (!skip_sum) {
- ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
- file_offset);
+ } else {
+ ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
+ file_offset);
if (ret)
goto err;
}
-
map:
ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
err:
@@ -7403,7 +7771,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
u64 submit_len = 0;
u64 map_length;
int nr_pages = 0;
- int ret = 0;
+ int ret;
int async_submit = 0;
map_length = orig_bio->bi_iter.bi_size;
@@ -7414,6 +7782,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
if (map_length >= orig_bio->bi_iter.bi_size) {
bio = orig_bio;
+ dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
goto submit;
}
@@ -7430,12 +7799,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_io_bio(bio)->logical = file_offset;
atomic_inc(&dip->pending_bios);
while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
- if (unlikely(map_length < submit_len + bvec->bv_len ||
+ if (map_length < submit_len + bvec->bv_len ||
bio_add_page(bio, bvec->bv_page, bvec->bv_len,
- bvec->bv_offset) < bvec->bv_len)) {
+ bvec->bv_offset) < bvec->bv_len) {
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
@@ -7464,6 +7834,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
goto out_err;
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_io_bio(bio)->logical = file_offset;
map_length = orig_bio->bi_iter.bi_size;
ret = btrfs_map_block(root->fs_info, rw,
@@ -7507,11 +7878,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_dio_private *dip;
struct bio *io_bio;
+ struct btrfs_io_bio *btrfs_bio;
int skip_sum;
- int sum_len;
int write = rw & REQ_WRITE;
int ret = 0;
- u16 csum_size;
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
@@ -7521,16 +7891,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
goto free_ordered;
}
- if (!skip_sum && !write) {
- csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- sum_len = dio_bio->bi_iter.bi_size >>
- inode->i_sb->s_blocksize_bits;
- sum_len *= csum_size;
- } else {
- sum_len = 0;
- }
-
- dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
+ dip = kzalloc(sizeof(*dip), GFP_NOFS);
if (!dip) {
ret = -ENOMEM;
goto free_io_bio;
@@ -7542,20 +7903,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
io_bio->bi_private = dip;
- dip->errors = 0;
dip->orig_bio = io_bio;
dip->dio_bio = dio_bio;
atomic_set(&dip->pending_bios, 0);
+ btrfs_bio = btrfs_io_bio(io_bio);
+ btrfs_bio->logical = file_offset;
- if (write)
+ if (write) {
io_bio->bi_end_io = btrfs_endio_direct_write;
- else
+ } else {
io_bio->bi_end_io = btrfs_endio_direct_read;
+ dip->subio_endio = btrfs_subio_endio_read;
+ }
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret)
return;
+ if (btrfs_bio->end_io)
+ btrfs_bio->end_io(btrfs_bio, ret);
free_io_bio:
bio_put(io_bio);
@@ -7652,8 +8018,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
ret = btrfs_delalloc_reserve_space(inode, count);
if (ret)
goto out;
- } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags))) {
+ } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags)) {
inode_dio_done(inode);
flags = DIO_LOCKING | DIO_SKIP_HOLES;
wakeup = false;
@@ -8173,6 +8539,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
ei->csum_bytes = 0;
@@ -8231,6 +8598,7 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(BTRFS_I(inode)->reserved_extents);
WARN_ON(BTRFS_I(inode)->delalloc_bytes);
WARN_ON(BTRFS_I(inode)->csum_bytes);
+ WARN_ON(BTRFS_I(inode)->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8646,7 +9014,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
spin_unlock(&root->delalloc_lock);
work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
- if (unlikely(!work)) {
+ if (!work) {
if (delay_iput)
btrfs_add_delayed_iput(inode);
else
@@ -8832,7 +9200,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
}
key.objectid = btrfs_ino(inode);
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+ key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(name_len);
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8a8e29878c34..0fe1aa047f15 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
goto out_drop;
} else {
+ ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
+ if (ret && ret != -ENODATA)
+ goto out_drop;
ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
@@ -477,8 +480,7 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail;
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -503,7 +505,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
- btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+ btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_flags(&root_item, 0);
@@ -535,7 +537,7 @@ static noinline int create_subvol(struct inode *dir,
key.objectid = objectid;
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
&root_item);
if (ret)
@@ -882,7 +884,7 @@ out_unlock:
* file you want to defrag, we return 0 to let you know to skip this
* part of the file
*/
-static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
+static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
{
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em = NULL;
@@ -917,7 +919,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
*/
static int find_new_extents(struct btrfs_root *root,
struct inode *inode, u64 newer_than,
- u64 *off, int thresh)
+ u64 *off, u32 thresh)
{
struct btrfs_path *path;
struct btrfs_key min_key;
@@ -936,12 +938,9 @@ static int find_new_extents(struct btrfs_root *root,
min_key.offset = *off;
while (1) {
- path->keep_locks = 1;
ret = btrfs_search_forward(root, &min_key, path, newer_than);
if (ret != 0)
goto none;
- path->keep_locks = 0;
- btrfs_unlock_up_safe(path, 1);
process_slot:
if (min_key.objectid != ino)
goto none;
@@ -1029,7 +1028,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, int thresh,
+static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
u64 *last_len, u64 *skip, u64 *defrag_end,
int compress)
{
@@ -1259,7 +1258,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
int ret;
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
- int extent_thresh = range->extent_thresh;
+ u32 extent_thresh = range->extent_thresh;
unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
unsigned long cluster = max_cluster;
u64 new_align = ~((u64)128 * 1024 - 1);
@@ -1335,8 +1334,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
inode->i_mapping->writeback_index = i;
while (i <= last_index && defrag_count < max_to_defrag &&
- (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT)) {
+ (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
/*
* make sure we stop running if someone unmounts
* the FS
@@ -1359,7 +1357,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
* the should_defrag function tells us how much to skip
* bump our counter by the suggested amount
*/
- next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
i = max(i + 1, next);
continue;
}
@@ -1554,7 +1552,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
goto out_free;
}
- old_size = device->total_bytes;
+ old_size = btrfs_device_get_total_bytes(device);
if (mod < 0) {
if (new_size > old_size) {
@@ -2089,8 +2087,6 @@ static noinline int search_ioctl(struct inode *inode,
key.type = sk->min_type;
key.offset = sk->min_offset;
- path->keep_locks = 1;
-
while (1) {
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
@@ -2423,9 +2419,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- err = d_invalidate(dentry);
- if (err)
- goto out_unlock;
+ d_invalidate(dentry);
down_write(&root->fs_info->subvol_sem);
@@ -2510,7 +2504,6 @@ out_release:
btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
out_up_write:
up_write(&root->fs_info->subvol_sem);
-out_unlock:
if (err) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
@@ -2526,9 +2519,9 @@ out_unlock:
ASSERT(dest->send_in_progress == 0);
/* the last ref */
- if (dest->cache_inode) {
- iput(dest->cache_inode);
- dest->cache_inode = NULL;
+ if (dest->ino_cache_inode) {
+ iput(dest->ino_cache_inode);
+ dest->ino_cache_inode = NULL;
}
}
out_dput:
@@ -2634,6 +2627,9 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_init_new_device(root, vol_args->name);
+ if (!ret)
+ btrfs_info(root->fs_info, "disk added %s",vol_args->name);
+
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
@@ -2673,6 +2669,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
mutex_unlock(&root->fs_info->volume_mutex);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ if (!ret)
+ btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+
out:
kfree(vol_args);
err_drop:
@@ -2737,8 +2736,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
}
di_args->devid = dev->devid;
- di_args->bytes_used = dev->bytes_used;
- di_args->total_bytes = dev->total_bytes;
+ di_args->bytes_used = btrfs_device_get_bytes_used(dev);
+ di_args->total_bytes = btrfs_device_get_total_bytes(dev);
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
if (dev->name) {
struct rcu_string *name;
@@ -3164,7 +3163,7 @@ static void clone_update_extent_map(struct inode *inode,
em->start + em->len - 1, 0);
}
- if (unlikely(ret))
+ if (ret)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
}
@@ -3199,7 +3198,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 last_dest_end = destoff;
ret = -ENOMEM;
- buf = vmalloc(btrfs_level_size(root, 0));
+ buf = vmalloc(root->nodesize);
if (!buf)
return ret;
@@ -3252,11 +3251,11 @@ process_slot:
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
- if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
+ if (key.type > BTRFS_EXTENT_DATA_KEY ||
key.objectid != btrfs_ino(src))
break;
- if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
struct btrfs_file_extent_item *extent;
int type;
u32 size;
@@ -5283,6 +5282,12 @@ long btrfs_ioctl(struct file *file, unsigned int
if (ret)
return ret;
ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ /*
+ * The transaction thread may want to do more work,
+ * namely it pokes the cleaner ktread that will start
+ * processing uncleaned subvols.
+ */
+ wake_up_process(root->fs_info->transaction_kthread);
return ret;
}
case BTRFS_IOC_START_SYNC:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index dfad8514f0da..78285f30909e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -266,8 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
char *data_in;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
- unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE;
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
unsigned long buf_start;
unsigned long buf_offset = 0;
unsigned long bytes;
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 65793edb38ca..47767d5b8f0b 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -27,7 +27,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
int ret = 0;
key.objectid = BTRFS_ORPHAN_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = offset;
path = btrfs_alloc_path();
@@ -48,7 +48,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
int ret = 0;
key.objectid = BTRFS_ORPHAN_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = offset;
path = btrfs_alloc_path();
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9626b4ad3b9a..647ab12fdf5d 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -195,7 +195,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
for (i = 0 ; i < nr ; i++) {
item = btrfs_item_nr(i);
btrfs_item_key_to_cpu(l, &key, i);
- type = btrfs_key_type(&key);
+ type = key.type;
printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
"itemsize %d\n",
i, key.objectid, type, key.offset,
@@ -336,7 +336,6 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
for (i = 0; i < nr; i++) {
struct extent_buffer *next = read_tree_block(root,
btrfs_node_blockptr(c, i),
- btrfs_level_size(root, level - 1),
btrfs_node_ptr_generation(c, i));
if (btrfs_is_leaf(next) &&
level != 1)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ded5c601d916..48b60dbf807f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -539,10 +539,9 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
+ if (btrfs_test_is_dummy_root(quota_root))
return 0;
-#endif
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -551,9 +550,15 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroupid;
+ /*
+ * Avoid a transaction abort by catching -EEXIST here. In that
+ * case, we proceed by re-initializing the existing structure
+ * on disk.
+ */
+
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_info));
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
leaf = path->nodes[0];
@@ -572,7 +577,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_QGROUP_LIMIT_KEY;
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_limit));
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
leaf = path->nodes[0];
@@ -692,10 +697,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
int ret;
int slot;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
key.objectid = 0;
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroup->qgroupid;
@@ -1335,6 +1339,8 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&oper->elem.list);
oper->elem.seq = 0;
+ trace_btrfs_qgroup_record_ref(oper);
+
if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
/*
* If any operation for this bytenr/ref_root combo
@@ -2077,6 +2083,8 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
ASSERT(is_fstree(oper->ref_root));
+ trace_btrfs_qgroup_account(oper);
+
switch (oper->type) {
case BTRFS_QGROUP_OPER_ADD_EXCL:
case BTRFS_QGROUP_OPER_SUB_EXCL:
@@ -2237,7 +2245,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
if (srcid) {
struct btrfs_root *srcroot;
struct btrfs_key srckey;
- int srcroot_level;
srckey.objectid = srcid;
srckey.type = BTRFS_ROOT_ITEM_KEY;
@@ -2249,8 +2256,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
}
rcu_read_lock();
- srcroot_level = btrfs_header_level(srcroot->node);
- level_size = btrfs_level_size(srcroot, srcroot_level);
+ level_size = srcroot->nodesize;
rcu_read_unlock();
}
@@ -2566,7 +2572,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
found.type != BTRFS_METADATA_ITEM_KEY)
continue;
if (found.type == BTRFS_METADATA_ITEM_KEY)
- num_bytes = fs_info->extent_root->leafsize;
+ num_bytes = fs_info->extent_root->nodesize;
else
num_bytes = found.offset;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0a6b6e4bcbb9..6a41631cb959 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -912,7 +912,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
unsigned long nr = stripe_len * nr_stripes;
- return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
}
/*
@@ -1442,7 +1442,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
struct btrfs_bio *bbio = rbio->bbio;
struct bio_list bio_list;
int ret;
- int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
int pagenr;
int stripe;
struct bio *bio;
@@ -1725,7 +1725,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
int pagenr, stripe;
void **pointers;
int faila = -1, failb = -1;
- int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
struct page *page;
int err;
int i;
@@ -1940,7 +1940,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
struct btrfs_bio *bbio = rbio->bbio;
struct bio_list bio_list;
int ret;
- int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
int pagenr;
int stripe;
struct bio *bio;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 20408c6b665a..b63ae20618fb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -347,7 +347,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
if (!re)
return NULL;
- blocksize = btrfs_level_size(root, level);
+ blocksize = root->nodesize;
re->logical = logical;
re->blocksize = blocksize;
re->top = *top;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 65245a07275b..74257d6436ad 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -736,7 +736,8 @@ again:
err = ret;
goto out;
}
- BUG_ON(!ret || !path1->slots[0]);
+ ASSERT(ret);
+ ASSERT(path1->slots[0]);
path1->slots[0]--;
@@ -746,10 +747,10 @@ again:
* the backref was added previously when processing
* backref of type BTRFS_TREE_BLOCK_REF_KEY
*/
- BUG_ON(!list_is_singular(&cur->upper));
+ ASSERT(list_is_singular(&cur->upper));
edge = list_entry(cur->upper.next, struct backref_edge,
list[LOWER]);
- BUG_ON(!list_empty(&edge->list[UPPER]));
+ ASSERT(list_empty(&edge->list[UPPER]));
exist = edge->node[UPPER];
/*
* add the upper level block to pending list if we need
@@ -831,7 +832,7 @@ again:
cur->cowonly = 1;
}
#else
- BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+ ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
#endif
if (key.objectid == key.offset) {
@@ -840,7 +841,7 @@ again:
* backref of this type.
*/
root = find_reloc_root(rc, cur->bytenr);
- BUG_ON(!root);
+ ASSERT(root);
cur->root = root;
break;
}
@@ -868,7 +869,7 @@ again:
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
- BUG_ON(!upper->checked);
+ ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
list_add_tail(&edge->list[LOWER], &cur->upper);
@@ -892,7 +893,7 @@ again:
if (btrfs_root_level(&root->root_item) == cur->level) {
/* tree root */
- BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
cur->bytenr);
if (should_ignore_root(root))
list_add(&cur->list, &useless);
@@ -927,7 +928,7 @@ again:
need_check = true;
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path2->nodes[level]) {
- BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
lower->bytenr);
if (should_ignore_root(root))
list_add(&lower->list, &useless);
@@ -977,12 +978,15 @@ again:
need_check = false;
list_add_tail(&edge->list[UPPER],
&list);
- } else
+ } else {
+ if (upper->checked)
+ need_check = true;
INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
- BUG_ON(!upper->checked);
+ ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
if (!upper->owner)
upper->owner = btrfs_header_owner(eb);
@@ -1026,7 +1030,7 @@ next:
* everything goes well, connect backref nodes and insert backref nodes
* into the cache.
*/
- BUG_ON(!node->checked);
+ ASSERT(node->checked);
cowonly = node->cowonly;
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, node->bytenr,
@@ -1062,8 +1066,21 @@ next:
continue;
}
- BUG_ON(!upper->checked);
- BUG_ON(cowonly != upper->cowonly);
+ if (!upper->checked) {
+ /*
+ * Still want to blow up for developers since this is a
+ * logic bug.
+ */
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+ if (cowonly != upper->cowonly) {
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, upper->bytenr,
&upper->rb_node);
@@ -1086,7 +1103,7 @@ next:
while (!list_empty(&useless)) {
upper = list_entry(useless.next, struct backref_node, list);
list_del_init(&upper->list);
- BUG_ON(!list_empty(&upper->upper));
+ ASSERT(list_empty(&upper->upper));
if (upper == node)
node = NULL;
if (upper->lowest) {
@@ -1119,29 +1136,45 @@ out:
if (err) {
while (!list_empty(&useless)) {
lower = list_entry(useless.next,
- struct backref_node, upper);
- list_del_init(&lower->upper);
+ struct backref_node, list);
+ list_del_init(&lower->list);
}
- upper = node;
- INIT_LIST_HEAD(&list);
- while (upper) {
- if (RB_EMPTY_NODE(&upper->rb_node)) {
- list_splice_tail(&upper->upper, &list);
- free_backref_node(cache, upper);
- }
-
- if (list_empty(&list))
- break;
-
- edge = list_entry(list.next, struct backref_edge,
- list[LOWER]);
+ while (!list_empty(&list)) {
+ edge = list_first_entry(&list, struct backref_edge,
+ list[UPPER]);
+ list_del(&edge->list[UPPER]);
list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
upper = edge->node[UPPER];
free_backref_edge(cache, edge);
+
+ /*
+ * Lower is no longer linked to any upper backref nodes
+ * and isn't in the cache, we can free it ourselves.
+ */
+ if (list_empty(&lower->upper) &&
+ RB_EMPTY_NODE(&lower->rb_node))
+ list_add(&lower->list, &useless);
+
+ if (!RB_EMPTY_NODE(&upper->rb_node))
+ continue;
+
+ /* Add this guy's upper edges to the list to proces */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+ if (list_empty(&upper->upper))
+ list_add(&upper->list, &useless);
+ }
+
+ while (!list_empty(&useless)) {
+ lower = list_entry(useless.next,
+ struct backref_node, list);
+ list_del_init(&lower->list);
+ free_backref_node(cache, lower);
}
return ERR_PTR(err);
}
- BUG_ON(node && node->detached);
+ ASSERT(!node || !node->detached);
return node;
}
@@ -1787,7 +1820,7 @@ again:
btrfs_node_key_to_cpu(parent, next_key, slot + 1);
old_bytenr = btrfs_node_blockptr(parent, slot);
- blocksize = btrfs_level_size(dest, level - 1);
+ blocksize = dest->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
if (level <= max_level) {
@@ -1813,8 +1846,7 @@ again:
break;
}
- eb = read_tree_block(dest, old_bytenr, blocksize,
- old_ptr_gen);
+ eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
if (!eb || !extent_buffer_uptodate(eb)) {
ret = (!eb) ? -ENOMEM : -EIO;
free_extent_buffer(eb);
@@ -1944,7 +1976,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
u64 bytenr;
u64 ptr_gen = 0;
u64 last_snapshot;
- u32 blocksize;
u32 nritems;
last_snapshot = btrfs_root_last_snapshot(&root->root_item);
@@ -1970,8 +2001,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
}
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
- blocksize = btrfs_level_size(root, i - 1);
- eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
+ eb = read_tree_block(root, bytenr, ptr_gen);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
@@ -2316,7 +2346,7 @@ void free_reloc_roots(struct list_head *list)
}
static noinline_for_stack
-int merge_reloc_roots(struct reloc_control *rc)
+void merge_reloc_roots(struct reloc_control *rc)
{
struct btrfs_root *root;
struct btrfs_root *reloc_root;
@@ -2397,7 +2427,6 @@ out:
}
BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
- return ret;
}
static void free_block_list(struct rb_root *blocks)
@@ -2544,8 +2573,7 @@ u64 calcu_metadata_size(struct reloc_control *rc,
if (next->processed && (reserve || next != node))
break;
- num_bytes += btrfs_level_size(rc->extent_root,
- next->level);
+ num_bytes += rc->extent_root->nodesize;
if (list_empty(&next->upper))
break;
@@ -2679,9 +2707,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
goto next;
}
- blocksize = btrfs_level_size(root, node->level);
+ blocksize = root->nodesize;
generation = btrfs_node_ptr_generation(upper->eb, slot);
- eb = read_tree_block(root, bytenr, blocksize, generation);
+ eb = read_tree_block(root, bytenr, generation);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
err = -EIO;
@@ -2789,7 +2817,7 @@ static void __mark_block_processed(struct reloc_control *rc,
u32 blocksize;
if (node->level == 0 ||
in_block_group(node->bytenr, rc->block_group)) {
- blocksize = btrfs_level_size(rc->extent_root, node->level);
+ blocksize = rc->extent_root->nodesize;
mark_block_processed(rc, node->bytenr, blocksize);
}
node->processed = 1;
@@ -2843,7 +2871,7 @@ static int get_tree_block_key(struct reloc_control *rc,
BUG_ON(block->key_ready);
eb = read_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid, block->key.offset);
+ block->key.offset);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
@@ -2858,20 +2886,6 @@ static int get_tree_block_key(struct reloc_control *rc,
return 0;
}
-static int reada_tree_block(struct reloc_control *rc,
- struct tree_block *block)
-{
- BUG_ON(block->key_ready);
- if (block->key.type == BTRFS_METADATA_ITEM_KEY)
- readahead_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid,
- rc->extent_root->leafsize);
- else
- readahead_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid, block->key.offset);
- return 0;
-}
-
/*
* helper function to relocate a tree block
*/
@@ -2951,7 +2965,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
if (!block->key_ready)
- reada_tree_block(rc, block);
+ readahead_tree_block(rc->extent_root, block->bytenr,
+ block->key.objectid);
rb_node = rb_next(rb_node);
}
@@ -3313,7 +3328,7 @@ static int add_tree_block(struct reloc_control *rc,
return -ENOMEM;
block->bytenr = extent_key->objectid;
- block->key.objectid = rc->extent_root->leafsize;
+ block->key.objectid = rc->extent_root->nodesize;
block->key.offset = generation;
block->level = level;
block->key_ready = 0;
@@ -3640,7 +3655,7 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_extent_inline_ref *iref;
unsigned long ptr;
unsigned long end;
- u32 blocksize = btrfs_level_size(rc->extent_root, 0);
+ u32 blocksize = rc->extent_root->nodesize;
int ret = 0;
int err = 0;
@@ -3783,7 +3798,7 @@ next:
}
if (key.type == BTRFS_METADATA_ITEM_KEY &&
- key.objectid + rc->extent_root->leafsize <=
+ key.objectid + rc->extent_root->nodesize <=
rc->search_start) {
path->slots[0]++;
goto next;
@@ -3801,7 +3816,7 @@ next:
rc->search_start = key.objectid + key.offset;
else
rc->search_start = key.objectid +
- rc->extent_root->leafsize;
+ rc->extent_root->nodesize;
memcpy(extent_key, &key, sizeof(key));
return 0;
}
@@ -4096,7 +4111,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(path);
out:
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f4a41f37be22..efa083113827 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -137,7 +137,6 @@ struct scrub_ctx {
int pages_per_rd_bio;
u32 sectorsize;
u32 nodesize;
- u32 leafsize;
int is_dev_replace;
struct scrub_wr_ctx wr_ctx;
@@ -178,17 +177,12 @@ struct scrub_copy_nocow_ctx {
struct scrub_warning {
struct btrfs_path *path;
u64 extent_item_size;
- char *scratch_buf;
- char *msg_buf;
const char *errstr;
sector_t sector;
u64 logical;
struct btrfs_device *dev;
- int msg_bufsize;
- int scratch_bufsize;
};
-
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -438,7 +432,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
}
sctx->first_free = 0;
sctx->nodesize = dev->dev_root->nodesize;
- sctx->leafsize = dev->dev_root->leafsize;
sctx->sectorsize = dev->dev_root->sectorsize;
atomic_set(&sctx->bios_in_flight, 0);
atomic_set(&sctx->workers_pending, 0);
@@ -553,7 +546,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
u64 ref_root;
u32 item_size;
u8 ref_level;
- const int bufsize = 4096;
int ret;
WARN_ON(sblock->page_count < 1);
@@ -561,18 +553,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
fs_info = sblock->sctx->dev_root->fs_info;
path = btrfs_alloc_path();
+ if (!path)
+ return;
- swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
- swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
swarn.sector = (sblock->pagev[0]->physical) >> 9;
swarn.logical = sblock->pagev[0]->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
- swarn.msg_bufsize = bufsize;
- swarn.scratch_bufsize = bufsize;
-
- if (!path || !swarn.scratch_buf || !swarn.msg_buf)
- goto out;
ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
&flags);
@@ -613,8 +600,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
out:
btrfs_free_path(path);
- kfree(swarn.scratch_buf);
- kfree(swarn.msg_buf);
}
static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
@@ -681,9 +666,9 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
ret = -EIO;
goto out;
}
- fs_info = BTRFS_I(inode)->root->fs_info;
- ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
+ ret = repair_io_failure(inode, offset, PAGE_SIZE,
fixup->logical, page,
+ offset - page_offset(page),
fixup->mirror_num);
unlock_page(page);
corrected = !ret;
@@ -1361,6 +1346,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
return;
}
+static inline int scrub_check_fsid(u8 fsid[],
+ struct scrub_page *spage)
+{
+ struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
+ int ret;
+
+ ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
+ return !ret;
+}
+
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int is_metadata, int have_csum,
@@ -1380,7 +1375,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
h = (struct btrfs_header *)mapped_buffer;
if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
- memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
+ !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE)) {
sblock->header_error = 1;
@@ -1751,14 +1746,13 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
++fail;
- if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+ if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
++fail;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
++fail;
- WARN_ON(sctx->nodesize != sctx->leafsize);
len = sctx->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
@@ -1791,8 +1785,6 @@ static int scrub_checksum_super(struct scrub_block *sblock)
{
struct btrfs_super_block *s;
struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_root *root = sctx->dev_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
struct page *page;
@@ -1817,7 +1809,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
if (sblock->pagev[0]->generation != btrfs_super_generation(s))
++fail_gen;
- if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+ if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
++fail_cor;
len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
@@ -2196,7 +2188,6 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
sctx->stat.data_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- WARN_ON(sctx->nodesize != sctx->leafsize);
blocksize = sctx->nodesize;
spin_lock(&sctx->stat_lock);
sctx->stat.tree_extents_scrubbed++;
@@ -2487,7 +2478,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
btrfs_item_key_to_cpu(l, &key, slot);
if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = root->leafsize;
+ bytes = root->nodesize;
else
bytes = key.offset;
@@ -2714,7 +2705,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (found_key.objectid != scrub_dev->devid)
break;
- if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
+ if (found_key.type != BTRFS_DEV_EXTENT_KEY)
break;
if (found_key.offset >= end)
@@ -2828,11 +2819,16 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return -EIO;
- gen = root->fs_info->last_trans_committed;
+ /* Seed devices of a new filesystem has their own generation. */
+ if (scrub_dev->fs_devices != root->fs_info->fs_devices)
+ gen = scrub_dev->generation;
+ else
+ gen = root->fs_info->last_trans_committed;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >
+ scrub_dev->commit_total_bytes)
break;
ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
@@ -2910,17 +2906,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (btrfs_fs_closing(fs_info))
return -EINVAL;
- /*
- * check some assumptions
- */
- if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
- btrfs_err(fs_info,
- "scrub: size assumption nodesize == leafsize (%d == %d) fails",
- fs_info->chunk_root->nodesize,
- fs_info->chunk_root->leafsize);
- return -EINVAL;
- }
-
if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
/*
* in this case scrub is unable to calculate the checksum
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6528aa662181..874828dd0a86 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -515,7 +515,8 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
set_fs(KERNEL_DS);
while (pos < len) {
- ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
+ ret = vfs_write(filp, (__force const char __user *)buf + pos,
+ len - pos, off);
/* TODO handle that correctly */
/*if (ret == -ERESTARTSYS) {
continue;
@@ -985,11 +986,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
int num;
u8 type;
- if (found_key->type == BTRFS_XATTR_ITEM_KEY)
- buf_len = BTRFS_MAX_XATTR_SIZE(root);
- else
- buf_len = PATH_MAX;
-
+ /*
+ * Start with a small buffer (1 page). If later we end up needing more
+ * space, which can happen for xattrs on a fs with a leaf size greater
+ * then the page size, attempt to increase the buffer. Typically xattr
+ * values are small.
+ */
+ buf_len = PATH_MAX;
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -1016,7 +1019,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
ret = -ENAMETOOLONG;
goto out;
}
- if (name_len + data_len > buf_len) {
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
ret = -E2BIG;
goto out;
}
@@ -1024,12 +1027,34 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
/*
* Path too long
*/
- if (name_len + data_len > buf_len) {
+ if (name_len + data_len > PATH_MAX) {
ret = -ENAMETOOLONG;
goto out;
}
}
+ if (name_len + data_len > buf_len) {
+ buf_len = name_len + data_len;
+ if (is_vmalloc_addr(buf)) {
+ vfree(buf);
+ buf = NULL;
+ } else {
+ char *tmp = krealloc(buf, buf_len,
+ GFP_NOFS | __GFP_NOWARN);
+
+ if (!tmp)
+ kfree(buf);
+ buf = tmp;
+ }
+ if (!buf) {
+ buf = vmalloc(buf_len);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
name_len + data_len);
@@ -1050,7 +1075,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
}
out:
- kfree(buf);
+ kvfree(buf);
return ret;
}
@@ -3302,7 +3327,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
if (ret < 0 && ret != -ENOENT) {
goto out;
} else if (ret == -ENOENT) {
- ret = 1;
+ ret = 0;
break;
}
@@ -5703,7 +5728,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
NULL);
sort_clone_roots = 1;
- current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;
+ current->journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
current->journal_info = NULL;
if (ret < 0)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c4124de4435b..a2b97ef10317 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -60,6 +60,7 @@
#include "backref.h"
#include "tests/btrfs-tests.h"
+#include "qgroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -307,13 +308,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
static void btrfs_put_super(struct super_block *sb)
{
- (void)close_ctree(btrfs_sb(sb)->tree_root);
- /* FIXME: need to fix VFS to return error? */
- /* AV: return it _where_? ->put_super() can be triggered by any number
- * of async events, up to and including delivery of SIGKILL to the
- * last process that kept it busy. Or segfault in the aforementioned
- * process... Whom would you report that to?
- */
+ close_ctree(btrfs_sb(sb)->tree_root);
}
enum {
@@ -400,7 +395,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
int ret = 0;
char *compress_type;
bool compress_force = false;
- bool compress = false;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
if (cache_gen)
@@ -478,7 +472,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
/* Fallthrough */
case Opt_compress:
case Opt_compress_type:
- compress = true;
if (token == Opt_compress ||
token == Opt_compress_force ||
strcmp(args[0].from, "zlib") == 0) {
@@ -508,11 +501,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_and_info(root, FORCE_COMPRESS,
"force %s compression",
compress_type);
- } else if (compress) {
+ } else {
if (!btrfs_test_opt(root, COMPRESS))
btrfs_info(root->fs_info,
"btrfs: use %s compression",
compress_type);
+ /*
+ * If we remount from compress-force=xxx to
+ * compress=xxx, we need clear FORCE_COMPRESS
+ * flag, otherwise, there is no way for users
+ * to disable forcible compression separately.
+ */
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
break;
case Opt_ssd:
@@ -1014,7 +1014,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",nodatacow");
if (btrfs_test_opt(root, NOBARRIER))
seq_puts(seq, ",nobarrier");
- if (info->max_inline != 8192 * 1024)
+ if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
seq_printf(seq, ",max_inline=%llu", info->max_inline);
if (info->alloc_start != 0)
seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
@@ -1215,6 +1215,56 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
return root;
}
+static int parse_security_options(char *orig_opts,
+ struct security_mnt_opts *sec_opts)
+{
+ char *secdata = NULL;
+ int ret = 0;
+
+ secdata = alloc_secdata();
+ if (!secdata)
+ return -ENOMEM;
+ ret = security_sb_copy_data(orig_opts, secdata);
+ if (ret) {
+ free_secdata(secdata);
+ return ret;
+ }
+ ret = security_sb_parse_opts_str(secdata, sec_opts);
+ free_secdata(secdata);
+ return ret;
+}
+
+static int setup_security_options(struct btrfs_fs_info *fs_info,
+ struct super_block *sb,
+ struct security_mnt_opts *sec_opts)
+{
+ int ret = 0;
+
+ /*
+ * Call security_sb_set_mnt_opts() to check whether new sec_opts
+ * is valid.
+ */
+ ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_SECURITY
+ if (!fs_info->security_opts.num_mnt_opts) {
+ /* first time security setup, copy sec_opts to fs_info */
+ memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
+ } else {
+ /*
+ * Since SELinux(the only one supports security_mnt_opts) does
+ * NOT support changing context during remount/mount same sb,
+ * This must be the same or part of the same security options,
+ * just free it.
+ */
+ security_free_mnt_opts(sec_opts);
+ }
+#endif
+ return ret;
+}
+
/*
* Find a superblock for the given device / mount point.
*
@@ -1229,6 +1279,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
+ struct security_mnt_opts new_sec_opts;
fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
u64 subvol_objectid = 0;
@@ -1251,9 +1302,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
return root;
}
+ security_init_mnt_opts(&new_sec_opts);
+ if (data) {
+ error = parse_security_options(data, &new_sec_opts);
+ if (error)
+ return ERR_PTR(error);
+ }
+
error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
if (error)
- return ERR_PTR(error);
+ goto error_sec_opts;
/*
* Setup a dummy root and fs_info for test/set super. This is because
@@ -1262,13 +1320,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
* then open_ctree will properly initialize everything later.
*/
fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
- if (!fs_info)
- return ERR_PTR(-ENOMEM);
+ if (!fs_info) {
+ error = -ENOMEM;
+ goto error_sec_opts;
+ }
fs_info->fs_devices = fs_devices;
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+ security_init_mnt_opts(&fs_info->security_opts);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
error = -ENOMEM;
goto error_fs_info;
@@ -1306,8 +1367,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
}
root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
- if (IS_ERR(root))
+ if (IS_ERR(root)) {
+ deactivate_locked_super(s);
+ error = PTR_ERR(root);
+ goto error_sec_opts;
+ }
+
+ fs_info = btrfs_sb(s);
+ error = setup_security_options(fs_info, s, &new_sec_opts);
+ if (error) {
+ dput(root);
deactivate_locked_super(s);
+ goto error_sec_opts;
+ }
return root;
@@ -1315,6 +1387,8 @@ error_close_devices:
btrfs_close_devices(fs_devices);
error_fs_info:
free_fs_info(fs_info);
+error_sec_opts:
+ security_free_mnt_opts(&new_sec_opts);
return ERR_PTR(error);
}
@@ -1396,6 +1470,21 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
sync_filesystem(sb);
btrfs_remount_prepare(fs_info);
+ if (data) {
+ struct security_mnt_opts new_sec_opts;
+
+ security_init_mnt_opts(&new_sec_opts);
+ ret = parse_security_options(data, &new_sec_opts);
+ if (ret)
+ goto restore;
+ ret = setup_security_options(fs_info, sb,
+ &new_sec_opts);
+ if (ret) {
+ security_free_mnt_opts(&new_sec_opts);
+ goto restore;
+ }
+ }
+
ret = btrfs_parse_options(root, data);
if (ret) {
ret = -EINVAL;
@@ -1694,7 +1783,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
- /* holding chunk_muext to avoid allocating new chunks */
+ /*
+ * holding chunk_muext to avoid allocating new chunks, holding
+ * device_list_mutex to avoid the device being removed
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
@@ -1735,11 +1828,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
if (ret) {
mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return ret;
}
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
@@ -1769,7 +1864,7 @@ static struct file_system_type btrfs_fs_type = {
.name = "btrfs",
.mount = btrfs_mount,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};
MODULE_ALIAS_FS("btrfs");
@@ -1993,11 +2088,15 @@ static int __init init_btrfs_fs(void)
err = btrfs_prelim_ref_init();
if (err)
+ goto free_delayed_ref;
+
+ err = btrfs_end_io_wq_init();
+ if (err)
goto free_prelim_ref;
err = btrfs_interface_init();
if (err)
- goto free_delayed_ref;
+ goto free_end_io_wq;
btrfs_init_lockdep();
@@ -2015,6 +2114,8 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
+free_end_io_wq:
+ btrfs_end_io_wq_exit();
free_prelim_ref:
btrfs_prelim_ref_exit();
free_delayed_ref:
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 12e53556e214..b2e7bb4393f6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -242,7 +242,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj,
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
}
-BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show);
+BTRFS_ATTR(global_rsv_size, global_rsv_size_show);
static ssize_t global_rsv_reserved_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -251,7 +251,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
}
-BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);
+BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show);
#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
@@ -306,7 +306,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
struct btrfs_space_info *sinfo = to_space_info(kobj); \
return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
} \
-BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field)
+BTRFS_ATTR(field, btrfs_space_info_show_##field)
static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
struct kobj_attribute *a,
@@ -325,7 +325,7 @@ SPACE_INFO_ATTR(bytes_reserved);
SPACE_INFO_ATTR(bytes_may_use);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
-BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned);
+BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(flags),
@@ -363,7 +363,8 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label);
+ char *label = fs_info->super_copy->label;
+ return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
}
static ssize_t btrfs_label_store(struct kobject *kobj,
@@ -374,8 +375,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
struct btrfs_trans_handle *trans;
struct btrfs_root *root = fs_info->fs_root;
int ret;
+ size_t p_len;
- if (len >= BTRFS_LABEL_SIZE)
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ /*
+ * p_len is the len until the first occurrence of either
+ * '\n' or '\0'
+ */
+ p_len = strcspn(buf, "\n");
+
+ if (p_len >= BTRFS_LABEL_SIZE)
return -EINVAL;
trans = btrfs_start_transaction(root, 0);
@@ -383,7 +394,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
return PTR_ERR(trans);
spin_lock(&root->fs_info->super_lock);
- strcpy(fs_info->super_copy->label, buf);
+ memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
+ memcpy(fs_info->super_copy->label, buf, p_len);
spin_unlock(&root->fs_info->super_lock);
ret = btrfs_commit_transaction(trans, root);
@@ -392,14 +404,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
return ret;
}
-BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store);
-
-static ssize_t btrfs_no_store(struct kobject *kobj,
- struct kobj_attribute *a,
- const char *buf, size_t len)
-{
- return -EPERM;
-}
+BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
static ssize_t btrfs_nodesize_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -409,7 +414,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
}
-BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store);
+BTRFS_ATTR(nodesize, btrfs_nodesize_show);
static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -419,7 +424,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
}
-BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store);
+BTRFS_ATTR(sectorsize, btrfs_sectorsize_show);
static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -429,7 +434,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
}
-BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store);
+BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
static struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index ac46df37504c..f7dd298b3cf6 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -20,16 +20,20 @@ enum btrfs_feature_set {
.store = _store, \
}
-#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \
-static struct kobj_attribute btrfs_attr_##_name = \
- __INIT_KOBJ_ATTR(_name, _mode, _show, _store)
-#define BTRFS_ATTR(_name, _mode, _show) \
- BTRFS_ATTR_RW(_name, _mode, _show, NULL)
+#define BTRFS_ATTR_RW(_name, _show, _store) \
+ static struct kobj_attribute btrfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
+
+#define BTRFS_ATTR(_name, _show) \
+ static struct kobj_attribute btrfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
#define BTRFS_RAID_ATTR(_name, _show) \
-static struct kobj_attribute btrfs_raid_attr_##_name = \
+ static struct kobj_attribute btrfs_raid_attr_##_name = \
__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index c8d9ddf84c69..2299bfde39ee 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -40,11 +40,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
cache->key.offset = 1024 * 1024 * 1024;
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
cache->sectorsize = 4096;
+ cache->full_stripe_len = 4096;
spin_lock_init(&cache->lock);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->new_bg_list);
+ INIT_LIST_HEAD(&cache->bg_list);
btrfs_init_free_space_ctl(cache);
@@ -364,6 +365,517 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
return 0;
}
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ return ctl->free_extents > 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int
+check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
+ const int num_extents,
+ const int num_bitmaps)
+{
+ if (cache->free_space_ctl->free_extents != num_extents) {
+ test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ cache->free_space_ctl->free_extents, num_extents);
+ return -EINVAL;
+ }
+ if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
+ test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ cache->free_space_ctl->total_bitmaps, num_bitmaps);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int check_cache_empty(struct btrfs_block_group_cache *cache)
+{
+ u64 offset;
+ u64 max_extent_size;
+
+ /*
+ * Now lets confirm that there's absolutely no free space left to
+ * allocate.
+ */
+ if (cache->free_space_ctl->free_space != 0) {
+ test_msg("Cache free space is not 0\n");
+ return -EINVAL;
+ }
+
+ /* And any allocation request, no matter how small, should fail now. */
+ offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
+ &max_extent_size);
+ if (offset != 0) {
+ test_msg("Space allocation did not fail, returned offset: %llu",
+ offset);
+ return -EINVAL;
+ }
+
+ /* And no extent nor bitmap entries in the cache anymore. */
+ return check_num_extents_and_bitmaps(cache, 0, 0);
+}
+
+/*
+ * Before we were able to steal free space from a bitmap entry to an extent
+ * entry, we could end up with 2 entries representing a contiguous free space.
+ * One would be an extent entry and the other a bitmap entry. Since in order
+ * to allocate space to a caller we use only 1 entry, we couldn't return that
+ * whole range to the caller if it was requested. This forced the caller to
+ * either assume ENOSPC or perform several smaller space allocations, which
+ * wasn't optimal as they could be spread all over the block group while under
+ * concurrency (extra overhead and fragmentation).
+ *
+ * This stealing approach is benefical, since we always prefer to allocate from
+ * extent entries, both for clustered and non-clustered allocation requests.
+ */
+static int
+test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
+{
+ int ret;
+ u64 offset;
+ u64 max_extent_size;
+
+ bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
+ struct btrfs_free_space *);
+
+ test_msg("Running space stealing from bitmap to extent\n");
+
+ /*
+ * For this test, we want to ensure we end up with an extent entry
+ * immediately adjacent to a bitmap entry, where the bitmap starts
+ * at an offset where the extent entry ends. We keep adding and
+ * removing free space to reach into this state, but to get there
+ * we need to reach a point where marking new free space doesn't
+ * result in adding new extent entries or merging the new space
+ * with existing extent entries - the space ends up being marked
+ * in an existing bitmap that covers the new free space range.
+ *
+ * To get there, we need to reach the threshold defined set at
+ * cache->free_space_ctl->extents_thresh, which currently is
+ * 256 extents on a x86_64 system at least, and a few other
+ * conditions (check free_space_cache.c). Instead of making the
+ * test much longer and complicated, use a "use_bitmap" operation
+ * that forces use of bitmaps as soon as we have at least 1
+ * extent entry.
+ */
+ use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
+ cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
+
+ /*
+ * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
+ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 128 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
+ 128 * 1024 * 1024 - 512 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now make only the first 256Kb of the bitmap marked as free, so that
+ * we end up with only the following ranges marked as free space:
+ *
+ * [128Mb - 256Kb, 128Mb - 128Kb[
+ * [128Mb + 512Kb, 128Mb + 768Kb[
+ */
+ ret = btrfs_remove_free_space(cache,
+ 128 * 1024 * 1024 + 768 * 1024,
+ 128 * 1024 * 1024 - 768 * 1024);
+ if (ret) {
+ test_msg("Failed to free part of bitmap space %d\n", ret);
+ return ret;
+ }
+
+ /* Confirm that only those 2 ranges are marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 128 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+ if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
+ 256 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
+ * as free anymore.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
+ 128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Bitmap region not removed from space cache\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
+ * covered by the bitmap, isn't marked as free.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
+ 256 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
+ * by the bitmap too, isn't marked as free either.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024,
+ 256 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Now lets mark the region [128Mb, 128Mb + 512Kb[ as free too. But,
+ * lets make sure the free space cache marks it as free in the bitmap,
+ * and doesn't insert a new extent entry to represent this region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
+ test_msg("Bitmap region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now lets add a small free space region to the right of the previous
+ * one, which is not contiguous with it and is part of the bitmap too.
+ * The goal is to test that the bitmap entry space stealing doesn't
+ * steal this space region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
+ 4096);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will
+ * expand the range covered by the existing extent entry that represents
+ * the free space [128Mb - 256Kb, 128Mb - 128Kb[.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
+ 128 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
+ 128 * 1024)) {
+ test_msg("Extent region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that our extent entry didn't stole all free space from the
+ * bitmap, because of the small 4Kb free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free
+ * space. Without stealing bitmap free space into extent entry space,
+ * we would have all this free space represented by 2 entries in the
+ * cache:
+ *
+ * extent entry covering range: [128Mb - 256Kb, 128Mb[
+ * bitmap entry covering range: [128Mb, 128Mb + 768Kb[
+ *
+ * Attempting to allocate the whole free space (1Mb) would fail, because
+ * we can't allocate from multiple entries.
+ * With the bitmap free space stealing, we get a single extent entry
+ * that represents the 1Mb free space, and therefore we're able to
+ * allocate the whole free space at once.
+ */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 1 * 1024 * 1024)) {
+ test_msg("Expected region not marked as free\n");
+ return -ENOENT;
+ }
+
+ if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
+ test_msg("Cache free space is not 1Mb + 4Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 1 * 1024 * 1024, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
+ test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ /* All that remains is a 4Kb free space region in a bitmap. Confirm. */
+ ret = check_num_extents_and_bitmaps(cache, 1, 1);
+ if (ret)
+ return ret;
+
+ if (cache->free_space_ctl->free_space != 4096) {
+ test_msg("Cache free space is not 4Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 4096, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
+ test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ ret = check_cache_empty(cache);
+ if (ret)
+ return ret;
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ /*
+ * Now test a similar scenario, but where our extent entry is located
+ * to the right of the bitmap entry, so that we can check that stealing
+ * space from a bitmap to the front of an extent entry works.
+ */
+
+ /*
+ * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
+ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
+ 128 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
+ ret = test_add_free_space_entry(cache, 0,
+ 128 * 1024 * 1024 - 512 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now make only the last 256Kb of the bitmap marked as free, so that
+ * we end up with only the following ranges marked as free space:
+ *
+ * [128Mb + 128b, 128Mb + 256Kb[
+ * [128Mb - 768Kb, 128Mb - 512Kb[
+ */
+ ret = btrfs_remove_free_space(cache,
+ 0,
+ 128 * 1024 * 1024 - 768 * 1024);
+ if (ret) {
+ test_msg("Failed to free part of bitmap space %d\n", ret);
+ return ret;
+ }
+
+ /* Confirm that only those 2 ranges are marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
+ 128 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
+ 256 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
+ * as free anymore.
+ */
+ if (test_check_exists(cache, 0,
+ 128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Bitmap region not removed from space cache\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb - 512Kb, 128Mb[, which is
+ * covered by the bitmap, isn't marked as free.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Now lets mark the region [128Mb - 512Kb, 128Mb[ as free too. But,
+ * lets make sure the free space cache marks it as free in the bitmap,
+ * and doesn't insert a new extent entry to represent this region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024)) {
+ test_msg("Bitmap region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now lets add a small free space region to the left of the previous
+ * one, which is not contiguous with it and is part of the bitmap too.
+ * The goal is to test that the bitmap entry space stealing doesn't
+ * steal this space region.
+ */
+ ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will
+ * expand the range covered by the existing extent entry that represents
+ * the free space [128Mb + 128Kb, 128Mb + 256Kb[.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
+ test_msg("Extent region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that our extent entry didn't stole all free space from the
+ * bitmap, because of the small 8Kb free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free
+ * space. Without stealing bitmap free space into extent entry space,
+ * we would have all this free space represented by 2 entries in the
+ * cache:
+ *
+ * extent entry covering range: [128Mb, 128Mb + 256Kb[
+ * bitmap entry covering range: [128Mb - 768Kb, 128Mb[
+ *
+ * Attempting to allocate the whole free space (1Mb) would fail, because
+ * we can't allocate from multiple entries.
+ * With the bitmap free space stealing, we get a single extent entry
+ * that represents the 1Mb free space, and therefore we're able to
+ * allocate the whole free space at once.
+ */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
+ 1 * 1024 * 1024)) {
+ test_msg("Expected region not marked as free\n");
+ return -ENOENT;
+ }
+
+ if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
+ test_msg("Cache free space is not 1Mb + 8Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 1 * 1024 * 1024, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ /* All that remains is a 8Kb free space region in a bitmap. Confirm. */
+ ret = check_num_extents_and_bitmaps(cache, 1, 1);
+ if (ret)
+ return ret;
+
+ if (cache->free_space_ctl->free_space != 8192) {
+ test_msg("Cache free space is not 8Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 8192, 0,
+ &max_extent_size);
+ if (offset != (32 * 1024 * 1024)) {
+ test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ ret = check_cache_empty(cache);
+ if (ret)
+ return ret;
+
+ cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ return 0;
+}
+
int btrfs_test_free_space_cache(void)
{
struct btrfs_block_group_cache *cache;
@@ -386,6 +898,8 @@ int btrfs_test_free_space_cache(void)
ret = test_bitmaps_and_extents(cache);
if (ret)
goto out;
+
+ ret = test_steal_space_from_bitmap_to_extent(cache);
out:
__btrfs_remove_free_space_cache(cache->free_space_ctl);
kfree(cache->free_space_ctl);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d89c6d3542ca..dcaae3616728 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -386,7 +386,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
int ret;
/* Send isn't supposed to start transactions. */
- ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
+ ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return ERR_PTR(-EROFS);
@@ -408,7 +408,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
if (num_items > 0 && root != root->fs_info->chunk_root) {
if (root->fs_info->quota_enabled &&
is_fstree(root->root_key.objectid)) {
- qgroup_reserved = num_items * root->leafsize;
+ qgroup_reserved = num_items * root->nodesize;
ret = btrfs_qgroup_reserve(root, qgroup_reserved);
if (ret)
return ERR_PTR(ret);
@@ -418,7 +418,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
/*
* Do the reservation for the relocation root creation
*/
- if (unlikely(need_reserve_reloc_root(root))) {
+ if (need_reserve_reloc_root(root)) {
num_bytes += root->nodesize;
reloc_reserved = true;
}
@@ -609,7 +609,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
if (transid <= root->fs_info->last_trans_committed)
goto out;
- ret = -EINVAL;
/* find specified transaction */
spin_lock(&root->fs_info->trans_lock);
list_for_each_entry(t, &root->fs_info->trans_list, list) {
@@ -625,9 +624,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
}
}
spin_unlock(&root->fs_info->trans_lock);
- /* The specified transaction doesn't exist */
- if (!cur_trans)
+
+ /*
+ * The specified transaction doesn't exist, or we
+ * raced with btrfs_commit_transaction
+ */
+ if (!cur_trans) {
+ if (transid > root->fs_info->last_trans_committed)
+ ret = -EINVAL;
goto out;
+ }
} else {
/* find newest transaction that is committing | committed */
spin_lock(&root->fs_info->trans_lock);
@@ -851,6 +857,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
+ struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
+ bool errors = false;
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT, &cached_state)) {
@@ -864,6 +872,26 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
}
if (err)
werr = err;
+
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if ((mark & EXTENT_DIRTY) &&
+ test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+
+ if ((mark & EXTENT_NEW) &&
+ test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+ } else {
+ if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+ }
+
+ if (errors && !werr)
+ werr = -EIO;
+
return werr;
}
@@ -1629,6 +1657,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
+ struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
int ret;
/* Stop the commit early if ->aborted is set */
@@ -1868,6 +1897,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
sizeof(*root->fs_info->super_copy));
+ btrfs_update_commit_device_size(root->fs_info);
+ btrfs_update_commit_device_bytes_used(root, cur_trans);
+
+ clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+ clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+
spin_lock(&root->fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
@@ -1981,9 +2016,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
ret = btrfs_drop_snapshot(root, NULL, 0, 0);
else
ret = btrfs_drop_snapshot(root, NULL, 1, 0);
- /*
- * If we encounter a transaction abort during snapshot cleaning, we
- * don't want to crash here
- */
+
return (ret < 0) ? 0 : 1;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 579be51b27e5..d8f40e1a5d2d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,7 +79,7 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
__TRANS_ATTACH)
-#define BTRFS_SEND_TRANS_STUB 1
+#define BTRFS_SEND_TRANS_STUB ((void *)1)
struct btrfs_trans_handle {
u64 transid;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1d1ba083ca6e..1475979e5718 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -97,7 +97,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
int inode_only,
const loff_t start,
- const loff_t end);
+ const loff_t end,
+ struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -1498,7 +1499,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
return -EIO;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
@@ -1637,6 +1638,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
found_key.type == log_key.type &&
found_key.offset == log_key.offset &&
btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+ update_size = false;
goto out;
}
@@ -2157,7 +2159,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- blocksize = btrfs_level_size(root, *level - 1);
+ blocksize = root->nodesize;
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
@@ -2983,8 +2985,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
min_key.type = key_type;
min_key.offset = min_offset;
- path->keep_locks = 1;
-
ret = btrfs_search_forward(root, &min_key, path, trans->transid);
/*
@@ -3364,7 +3364,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* or deletes of this inode don't have to relog the inode
* again
*/
- if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
+ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
!skip_csum) {
int found_type;
extent = btrfs_item_ptr(src, start_slot + i,
@@ -3573,107 +3573,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
return 0;
}
-static int log_one_extent(struct btrfs_trans_handle *trans,
- struct inode *inode, struct btrfs_root *root,
- struct extent_map *em, struct btrfs_path *path,
- struct list_head *logged_list)
+static int wait_ordered_extents(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_root *root,
+ const struct extent_map *em,
+ const struct list_head *logged_list,
+ bool *ordered_io_error)
{
- struct btrfs_root *log = root->log_root;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *leaf;
struct btrfs_ordered_extent *ordered;
- struct list_head ordered_sums;
- struct btrfs_map_token token;
- struct btrfs_key key;
+ struct btrfs_root *log = root->log_root;
u64 mod_start = em->mod_start;
u64 mod_len = em->mod_len;
+ const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
u64 csum_offset;
u64 csum_len;
- u64 extent_offset = em->start - em->orig_start;
- u64 block_len;
- int ret;
- bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- int extent_inserted = 0;
-
- INIT_LIST_HEAD(&ordered_sums);
- btrfs_init_map_token(&token);
-
- ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
- em->start + em->len, NULL, 0, 1,
- sizeof(*fi), &extent_inserted);
- if (ret)
- return ret;
-
- if (!extent_inserted) {
- key.objectid = btrfs_ino(inode);
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = em->start;
-
- ret = btrfs_insert_empty_item(trans, log, path, &key,
- sizeof(*fi));
- if (ret)
- return ret;
- }
- leaf = path->nodes[0];
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
- &token);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
- skip_csum = true;
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_PREALLOC,
- &token);
- } else {
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_REG,
- &token);
- if (em->block_start == EXTENT_MAP_HOLE)
- skip_csum = true;
- }
-
- block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
- em->block_start,
- &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
- em->block_start -
- extent_offset, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
- } else {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
- &token);
- }
-
- btrfs_set_token_file_extent_offset(leaf, fi,
- em->start - em->orig_start,
- &token);
- btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
- btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
- btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
- &token);
- btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
- btrfs_mark_buffer_dirty(leaf);
+ LIST_HEAD(ordered_sums);
+ int ret = 0;
- btrfs_release_path(path);
- if (ret) {
- return ret;
- }
+ *ordered_io_error = false;
- if (skip_csum)
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ em->block_start == EXTENT_MAP_HOLE)
return 0;
/*
- * First check and see if our csums are on our outstanding ordered
- * extents.
+ * Wait far any ordered extent that covers our extent map. If it
+ * finishes without an error, first check and see if our csums are on
+ * our outstanding ordered extents.
*/
list_for_each_entry(ordered, logged_list, log_list) {
struct btrfs_ordered_sum *sum;
@@ -3685,6 +3611,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
mod_start + mod_len <= ordered->file_offset)
continue;
+ if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+ const u64 start = ordered->file_offset;
+ const u64 end = ordered->file_offset + ordered->len - 1;
+
+ WARN_ON(ordered->inode != inode);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ }
+
+ wait_event(ordered->wait,
+ (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
+ test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
+
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+ *ordered_io_error = true;
+ break;
+ }
/*
* We are going to copy all the csums on this ordered extent, so
* go ahead and adjust mod_start and mod_len in case this
@@ -3716,6 +3660,9 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
}
}
+ if (skip_csum)
+ continue;
+
/*
* To keep us from looping for the above case of an ordered
* extent that falls inside of the logged extent.
@@ -3733,18 +3680,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
if (ret)
- goto unlocked;
+ break;
}
-
}
-unlocked:
- if (!mod_len || ret)
+ if (*ordered_io_error || !mod_len || ret || skip_csum)
return ret;
if (em->compress_type) {
csum_offset = 0;
- csum_len = block_len;
+ csum_len = max(em->block_len, em->orig_block_len);
} else {
csum_offset = mod_start - em->start;
csum_len = mod_len;
@@ -3771,11 +3716,106 @@ unlocked:
return ret;
}
+static int log_one_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct btrfs_root *root,
+ const struct extent_map *em,
+ struct btrfs_path *path,
+ const struct list_head *logged_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ struct btrfs_map_token token;
+ struct btrfs_key key;
+ u64 extent_offset = em->start - em->orig_start;
+ u64 block_len;
+ int ret;
+ int extent_inserted = 0;
+ bool ordered_io_err = false;
+
+ ret = wait_ordered_extents(trans, inode, root, em, logged_list,
+ &ordered_io_err);
+ if (ret)
+ return ret;
+
+ if (ordered_io_err) {
+ ctx->io_err = -EIO;
+ return 0;
+ }
+
+ btrfs_init_map_token(&token);
+
+ ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+ em->start + em->len, NULL, 0, 1,
+ sizeof(*fi), &extent_inserted);
+ if (ret)
+ return ret;
+
+ if (!extent_inserted) {
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = em->start;
+
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+ sizeof(*fi));
+ if (ret)
+ return ret;
+ }
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+ &token);
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_PREALLOC,
+ &token);
+ else
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG,
+ &token);
+
+ block_len = max(em->block_len, em->orig_block_len);
+ if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start,
+ &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start -
+ extent_offset, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
+ &token);
+ }
+
+ btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
+ btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
+ btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
+ btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
+ &token);
+ btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode,
struct btrfs_path *path,
- struct list_head *logged_list)
+ struct list_head *logged_list,
+ struct btrfs_log_ctx *ctx)
{
struct extent_map *em, *n;
struct list_head extents;
@@ -3833,7 +3873,8 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, logged_list);
+ ret = log_one_extent(trans, inode, root, em, path, logged_list,
+ ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -3863,7 +3904,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
int inode_only,
const loff_t start,
- const loff_t end)
+ const loff_t end,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
struct btrfs_path *dst_path;
@@ -3964,7 +4006,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
err = ret;
goto out_unlock;
}
- path->keep_locks = 1;
while (1) {
ins_nr = 0;
@@ -3994,7 +4035,8 @@ again:
if (ret < 0) {
err = ret;
goto out_unlock;
- } if (ret) {
+ }
+ if (ret) {
ins_nr = 0;
btrfs_release_path(path);
continue;
@@ -4048,7 +4090,7 @@ log_extents:
btrfs_release_path(dst_path);
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- &logged_list);
+ &logged_list, ctx);
if (ret) {
err = ret;
goto out_unlock;
@@ -4238,7 +4280,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, start, end);
+ ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
if (ret)
goto end_trans;
@@ -4267,7 +4309,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode, inode_only,
- 0, LLONG_MAX);
+ 0, LLONG_MAX, ctx);
if (ret)
goto end_trans;
}
@@ -4359,7 +4401,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
key.offset = (u64)-1;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
while (1) {
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index e2e798ae7cd7..154990c26dcb 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -28,6 +28,7 @@
struct btrfs_log_ctx {
int log_ret;
int log_transid;
+ int io_err;
struct list_head list;
};
@@ -35,6 +36,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
{
ctx->log_ret = 0;
ctx->log_transid = 0;
+ ctx->io_err = 0;
INIT_LIST_HEAD(&ctx->list);
}
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index f6a4c03ee7d8..778282944530 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -279,7 +279,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
key.offset = 0;
again_search_slot:
- path->keep_locks = 1;
ret = btrfs_search_forward(root, &key, path, 0);
if (ret) {
if (ret > 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2c2d6d1d8eee..d47289c715c8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -50,7 +50,7 @@ static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
-static DEFINE_MUTEX(uuid_mutex);
+DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
static void lock_chunks(struct btrfs_root *root)
@@ -74,6 +74,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
mutex_init(&fs_devs->device_list_mutex);
INIT_LIST_HEAD(&fs_devs->devices);
+ INIT_LIST_HEAD(&fs_devs->resized_devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->list);
@@ -154,11 +155,13 @@ static struct btrfs_device *__alloc_device(void)
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
+ INIT_LIST_HEAD(&dev->resized_list);
spin_lock_init(&dev->io_lock);
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
+ atomic_set(&dev->dev_stats_ccnt, 0);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
@@ -474,14 +477,13 @@ static noinline int device_list_add(const char *path,
return PTR_ERR(fs_devices);
list_add(&fs_devices->list, &fs_uuids);
- fs_devices->latest_devid = devid;
- fs_devices->latest_trans = found_transid;
device = NULL;
} else {
device = __find_device(&fs_devices->devices, devid,
disk_super->dev_item.uuid);
}
+
if (!device) {
if (fs_devices->opened)
return -EBUSY;
@@ -565,10 +567,6 @@ static noinline int device_list_add(const char *path,
if (!fs_devices->opened)
device->generation = found_transid;
- if (found_transid > fs_devices->latest_trans) {
- fs_devices->latest_devid = devid;
- fs_devices->latest_trans = found_transid;
- }
*fs_devices_ret = fs_devices;
return ret;
@@ -584,8 +582,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
if (IS_ERR(fs_devices))
return fs_devices;
- fs_devices->latest_devid = orig->latest_devid;
- fs_devices->latest_trans = orig->latest_trans;
+ mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
/* We have held the volume lock, it is safe to get the devices. */
@@ -614,8 +611,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
+ mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
+ mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(-ENOMEM);
}
@@ -624,10 +623,7 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
-
- struct block_device *latest_bdev = NULL;
- u64 latest_devid = 0;
- u64 latest_transid = 0;
+ struct btrfs_device *latest_dev = NULL;
mutex_lock(&uuid_mutex);
again:
@@ -635,11 +631,9 @@ again:
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
if (device->in_fs_metadata) {
if (!device->is_tgtdev_for_dev_replace &&
- (!latest_transid ||
- device->generation > latest_transid)) {
- latest_devid = device->devid;
- latest_transid = device->generation;
- latest_bdev = device->bdev;
+ (!latest_dev ||
+ device->generation > latest_dev->generation)) {
+ latest_dev = device;
}
continue;
}
@@ -681,9 +675,7 @@ again:
goto again;
}
- fs_devices->latest_bdev = latest_bdev;
- fs_devices->latest_devid = latest_devid;
- fs_devices->latest_trans = latest_transid;
+ fs_devices->latest_bdev = latest_dev->bdev;
mutex_unlock(&uuid_mutex);
}
@@ -732,8 +724,6 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
fs_devices->rw_devices--;
}
- if (device->can_discard)
- fs_devices->num_can_discard--;
if (device->missing)
fs_devices->missing_devices--;
@@ -798,11 +788,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
struct block_device *bdev;
struct list_head *head = &fs_devices->devices;
struct btrfs_device *device;
- struct block_device *latest_bdev = NULL;
+ struct btrfs_device *latest_dev = NULL;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
- u64 latest_devid = 0;
- u64 latest_transid = 0;
u64 devid;
int seeding = 1;
int ret = 0;
@@ -830,11 +818,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
goto error_brelse;
device->generation = btrfs_super_generation(disk_super);
- if (!latest_transid || device->generation > latest_transid) {
- latest_devid = devid;
- latest_transid = device->generation;
- latest_bdev = bdev;
- }
+ if (!latest_dev ||
+ device->generation > latest_dev->generation)
+ latest_dev = device;
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
device->writeable = 0;
@@ -844,10 +830,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
}
q = bdev_get_queue(bdev);
- if (blk_queue_discard(q)) {
+ if (blk_queue_discard(q))
device->can_discard = 1;
- fs_devices->num_can_discard++;
- }
device->bdev = bdev;
device->in_fs_metadata = 0;
@@ -877,9 +861,7 @@ error_brelse:
}
fs_devices->seeding = seeding;
fs_devices->opened = 1;
- fs_devices->latest_bdev = latest_bdev;
- fs_devices->latest_devid = latest_devid;
- fs_devices->latest_trans = latest_transid;
+ fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
out:
return ret;
@@ -1053,7 +1035,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
if (key.objectid > device->devid)
break;
- if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -1205,7 +1187,7 @@ again:
if (key.objectid > device->devid)
break;
- if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
if (key.offset > search_start) {
@@ -1284,7 +1266,7 @@ out:
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
- u64 start)
+ u64 start, u64 *dev_extent_len)
{
int ret;
struct btrfs_path *path;
@@ -1326,13 +1308,8 @@ again:
goto out;
}
- if (device->bytes_used > 0) {
- u64 len = btrfs_dev_extent_length(leaf, extent);
- device->bytes_used -= len;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += len;
- spin_unlock(&root->fs_info->free_chunk_lock);
- }
+ *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
+
ret = btrfs_del_item(trans, root, path);
if (ret) {
btrfs_error(root->fs_info, ret,
@@ -1482,8 +1459,10 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
- btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
- btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
btrfs_set_device_group(leaf, dev_item, 0);
btrfs_set_device_seek_speed(leaf, dev_item, 0);
btrfs_set_device_bandwidth(leaf, dev_item, 0);
@@ -1539,7 +1518,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
- lock_chunks(root);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -1555,7 +1533,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
goto out;
out:
btrfs_free_path(path);
- unlock_chunks(root);
btrfs_commit_transaction(trans, root);
return ret;
}
@@ -1671,8 +1648,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->writeable) {
lock_chunks(root);
list_del_init(&device->dev_alloc_list);
+ device->fs_devices->rw_devices--;
unlock_chunks(root);
- root->fs_info->fs_devices->rw_devices--;
clear_super = true;
}
@@ -1691,11 +1668,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (ret)
goto error_undo;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space = device->total_bytes -
- device->bytes_used;
- spin_unlock(&root->fs_info->free_chunk_lock);
-
device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root->fs_info, device);
@@ -1749,9 +1721,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
fs_devices = fs_devices->seed;
}
cur_devices->seed = NULL;
- lock_chunks(root);
__btrfs_close_devices(cur_devices);
- unlock_chunks(root);
free_fs_devices(cur_devices);
}
@@ -1824,8 +1794,8 @@ error_undo:
lock_chunks(root);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
+ device->fs_devices->rw_devices++;
unlock_chunks(root);
- root->fs_info->fs_devices->rw_devices++;
}
goto error_brelse;
}
@@ -1833,29 +1803,57 @@ error_undo:
void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev)
{
+ struct btrfs_fs_devices *fs_devices;
+
WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+ /*
+ * in case of fs with no seed, srcdev->fs_devices will point
+ * to fs_devices of fs_info. However when the dev being replaced is
+ * a seed dev it will point to the seed's local fs_devices. In short
+ * srcdev will have its correct fs_devices in both the cases.
+ */
+ fs_devices = srcdev->fs_devices;
+
list_del_rcu(&srcdev->dev_list);
list_del_rcu(&srcdev->dev_alloc_list);
- fs_info->fs_devices->num_devices--;
- if (srcdev->missing) {
- fs_info->fs_devices->missing_devices--;
- fs_info->fs_devices->rw_devices++;
- }
- if (srcdev->can_discard)
- fs_info->fs_devices->num_can_discard--;
- if (srcdev->bdev) {
- fs_info->fs_devices->open_devices--;
+ fs_devices->num_devices--;
+ if (srcdev->missing)
+ fs_devices->missing_devices--;
- /*
- * zero out the old super if it is not writable
- * (e.g. seed device)
- */
- if (srcdev->writeable)
- btrfs_scratch_superblock(srcdev);
+ if (srcdev->writeable) {
+ fs_devices->rw_devices--;
+ /* zero out the old super if it is writable */
+ btrfs_scratch_superblock(srcdev);
}
+ if (srcdev->bdev)
+ fs_devices->open_devices--;
+
call_rcu(&srcdev->rcu, free_device);
+
+ /*
+ * unless fs_devices is seed fs, num_devices shouldn't go
+ * zero
+ */
+ BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
+
+ /* if this is no devs we rather delete the fs_devices */
+ if (!fs_devices->num_devices) {
+ struct btrfs_fs_devices *tmp_fs_devices;
+
+ tmp_fs_devices = fs_info->fs_devices;
+ while (tmp_fs_devices) {
+ if (tmp_fs_devices->seed == fs_devices) {
+ tmp_fs_devices->seed = fs_devices->seed;
+ break;
+ }
+ tmp_fs_devices = tmp_fs_devices->seed;
+ }
+ fs_devices->seed = NULL;
+ __btrfs_close_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ }
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
@@ -1863,6 +1861,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_device *next_device;
+ mutex_lock(&uuid_mutex);
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
if (tgtdev->bdev) {
@@ -1870,8 +1869,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
fs_info->fs_devices->open_devices--;
}
fs_info->fs_devices->num_devices--;
- if (tgtdev->can_discard)
- fs_info->fs_devices->num_can_discard++;
next_device = list_entry(fs_info->fs_devices->devices.next,
struct btrfs_device, dev_list);
@@ -1884,6 +1881,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
call_rcu(&tgtdev->rcu, free_device);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
}
static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@ -1982,17 +1980,17 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
+ list_for_each_entry(device, &seed_devices->devices, dev_list)
+ device->fs_devices = seed_devices;
+ lock_chunks(root);
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- list_for_each_entry(device, &seed_devices->devices, dev_list) {
- device->fs_devices = seed_devices;
- }
+ unlock_chunks(root);
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
- fs_devices->num_can_discard = 0;
fs_devices->rotating = 0;
fs_devices->seed = seed_devices;
@@ -2092,7 +2090,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
struct list_head *devices;
struct super_block *sb = root->fs_info->sb;
struct rcu_string *name;
- u64 total_bytes;
+ u64 tmp;
int seeding_dev = 0;
int ret = 0;
@@ -2148,8 +2146,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
goto error;
}
- lock_chunks(root);
-
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
@@ -2160,6 +2156,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->sector_size = root->sectorsize;
device->total_bytes = i_size_read(bdev->bd_inode);
device->disk_total_bytes = device->total_bytes;
+ device->commit_total_bytes = device->total_bytes;
device->dev_root = root->fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
@@ -2177,6 +2174,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->fs_devices = root->fs_info->fs_devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ lock_chunks(root);
list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
@@ -2184,8 +2182,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
root->fs_info->fs_devices->open_devices++;
root->fs_info->fs_devices->rw_devices++;
root->fs_info->fs_devices->total_devices++;
- if (device->can_discard)
- root->fs_info->fs_devices->num_can_discard++;
root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
spin_lock(&root->fs_info->free_chunk_lock);
@@ -2195,26 +2191,45 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
root->fs_info->fs_devices->rotating = 1;
- total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+ tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
btrfs_set_super_total_bytes(root->fs_info->super_copy,
- total_bytes + device->total_bytes);
+ tmp + device->total_bytes);
- total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
+ tmp = btrfs_super_num_devices(root->fs_info->super_copy);
btrfs_set_super_num_devices(root->fs_info->super_copy,
- total_bytes + 1);
+ tmp + 1);
/* add sysfs device entry */
btrfs_kobj_add_device(root->fs_info, device);
+ /*
+ * we've got more storage, clear any full flags on the space
+ * infos
+ */
+ btrfs_clear_space_info_full(root->fs_info);
+
+ unlock_chunks(root);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
- char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
+ lock_chunks(root);
ret = init_first_rw_device(trans, root, device);
+ unlock_chunks(root);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
+ }
+
+ ret = btrfs_add_device(trans, root, device);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error_trans;
+ }
+
+ if (seeding_dev) {
+ char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
+
ret = btrfs_finish_sprout(trans, root);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -2228,21 +2243,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
root->fs_info->fsid);
if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
goto error_trans;
- } else {
- ret = btrfs_add_device(trans, root, device);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto error_trans;
- }
}
- /*
- * we've got more storage, clear any full flags on the space
- * infos
- */
- btrfs_clear_space_info_full(root->fs_info);
-
- unlock_chunks(root);
root->fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
ret = btrfs_commit_transaction(trans, root);
@@ -2274,7 +2276,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
return ret;
error_trans:
- unlock_chunks(root);
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
btrfs_kobj_rm_device(root->fs_info, device);
@@ -2289,6 +2290,7 @@ error:
}
int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
struct request_queue *q;
@@ -2301,24 +2303,38 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
int ret = 0;
*device_out = NULL;
- if (fs_info->fs_devices->seeding)
+ if (fs_info->fs_devices->seeding) {
+ btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
+ }
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
fs_info->bdev_holder);
- if (IS_ERR(bdev))
+ if (IS_ERR(bdev)) {
+ btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev);
+ }
filemap_write_and_wait(bdev->bd_inode->i_mapping);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
+ btrfs_err(fs_info, "target device is in the filesystem!");
ret = -EEXIST;
goto error;
}
}
+
+ if (i_size_read(bdev->bd_inode) <
+ btrfs_device_get_total_bytes(srcdev)) {
+ btrfs_err(fs_info, "target device is smaller than source device!");
+ ret = -EINVAL;
+ goto error;
+ }
+
+
device = btrfs_alloc_device(NULL, &devid, NULL);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
@@ -2342,8 +2358,12 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
device->io_width = root->sectorsize;
device->io_align = root->sectorsize;
device->sector_size = root->sectorsize;
- device->total_bytes = i_size_read(bdev->bd_inode);
- device->disk_total_bytes = device->total_bytes;
+ device->total_bytes = btrfs_device_get_total_bytes(srcdev);
+ device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
+ device->bytes_used = btrfs_device_get_bytes_used(srcdev);
+ ASSERT(list_empty(&srcdev->resized_list));
+ device->commit_total_bytes = srcdev->commit_total_bytes;
+ device->commit_bytes_used = device->bytes_used;
device->dev_root = fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
@@ -2355,8 +2375,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
fs_info->fs_devices->open_devices++;
- if (device->can_discard)
- fs_info->fs_devices->num_can_discard++;
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
*device_out = device;
@@ -2415,8 +2433,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
- btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
- btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
btrfs_mark_buffer_dirty(leaf);
out:
@@ -2424,40 +2444,44 @@ out:
return ret;
}
-static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
device->dev_root->fs_info->super_copy;
- u64 old_total = btrfs_super_total_bytes(super_copy);
- u64 diff = new_size - device->total_bytes;
+ struct btrfs_fs_devices *fs_devices;
+ u64 old_total;
+ u64 diff;
if (!device->writeable)
return -EACCES;
+
+ lock_chunks(device->dev_root);
+ old_total = btrfs_super_total_bytes(super_copy);
+ diff = new_size - device->total_bytes;
+
if (new_size <= device->total_bytes ||
- device->is_tgtdev_for_dev_replace)
+ device->is_tgtdev_for_dev_replace) {
+ unlock_chunks(device->dev_root);
return -EINVAL;
+ }
+
+ fs_devices = device->dev_root->fs_info->fs_devices;
btrfs_set_super_total_bytes(super_copy, old_total + diff);
device->fs_devices->total_rw_bytes += diff;
- device->total_bytes = new_size;
- device->disk_total_bytes = new_size;
+ btrfs_device_set_total_bytes(device, new_size);
+ btrfs_device_set_disk_total_bytes(device, new_size);
btrfs_clear_space_info_full(device->dev_root->fs_info);
+ if (list_empty(&device->resized_list))
+ list_add_tail(&device->resized_list,
+ &fs_devices->resized_devices);
+ unlock_chunks(device->dev_root);
return btrfs_update_device(trans, device);
}
-int btrfs_grow_device(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 new_size)
-{
- int ret;
- lock_chunks(device->dev_root);
- ret = __btrfs_grow_device(trans, device, new_size);
- unlock_chunks(device->dev_root);
- return ret;
-}
-
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 chunk_tree, u64 chunk_objectid,
@@ -2509,6 +2533,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
u32 cur;
struct btrfs_key key;
+ lock_chunks(root);
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
@@ -2538,79 +2563,95 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
cur += len;
}
}
+ unlock_chunks(root);
return ret;
}
-static int btrfs_relocate_chunk(struct btrfs_root *root,
- u64 chunk_tree, u64 chunk_objectid,
- u64 chunk_offset)
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 chunk_offset)
{
struct extent_map_tree *em_tree;
- struct btrfs_root *extent_root;
- struct btrfs_trans_handle *trans;
struct extent_map *em;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
struct map_lookup *map;
- int ret;
- int i;
+ u64 dev_extent_len = 0;
+ u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ u64 chunk_tree = root->fs_info->chunk_root->objectid;
+ int i, ret = 0;
+ /* Just in case */
root = root->fs_info->chunk_root;
- extent_root = root->fs_info->extent_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
- ret = btrfs_can_relocate(extent_root, chunk_offset);
- if (ret)
- return -ENOSPC;
-
- /* step one, relocate all the extents inside this chunk */
- ret = btrfs_relocate_block_group(extent_root, chunk_offset);
- if (ret)
- return ret;
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- btrfs_std_error(root->fs_info, ret);
- return ret;
- }
-
- lock_chunks(root);
-
- /*
- * step two, delete the device extents and the
- * chunk tree entries
- */
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
- BUG_ON(!em || em->start > chunk_offset ||
- em->start + em->len < chunk_offset);
+ if (!em || em->start > chunk_offset ||
+ em->start + em->len < chunk_offset) {
+ /*
+ * This is a logic error, but we don't want to just rely on the
+ * user having built with ASSERT enabled, so if ASSERT doens't
+ * do anything we still error out.
+ */
+ ASSERT(0);
+ if (em)
+ free_extent_map(em);
+ return -EINVAL;
+ }
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
- ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
- map->stripes[i].physical);
- BUG_ON(ret);
+ struct btrfs_device *device = map->stripes[i].dev;
+ ret = btrfs_free_dev_extent(trans, device,
+ map->stripes[i].physical,
+ &dev_extent_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ if (device->bytes_used > 0) {
+ lock_chunks(root);
+ btrfs_device_set_bytes_used(device,
+ device->bytes_used - dev_extent_len);
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += dev_extent_len;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ btrfs_clear_space_info_full(root->fs_info);
+ unlock_chunks(root);
+ }
if (map->stripes[i].dev) {
ret = btrfs_update_device(trans, map->stripes[i].dev);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
}
ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
chunk_offset);
-
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
@@ -2618,12 +2659,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
/* once for the tree */
free_extent_map(em);
+out:
/* once for us */
free_extent_map(em);
+ return ret;
+}
- unlock_chunks(root);
+static int btrfs_relocate_chunk(struct btrfs_root *root,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset)
+{
+ struct btrfs_root *extent_root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ root = root->fs_info->chunk_root;
+ extent_root = root->fs_info->extent_root;
+
+ ret = btrfs_can_relocate(extent_root, chunk_offset);
+ if (ret)
+ return -ENOSPC;
+
+ /* step one, relocate all the extents inside this chunk */
+ ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_std_error(root->fs_info, ret);
+ return ret;
+ }
+
+ /*
+ * step two, delete the device extents and the
+ * chunk tree entries
+ */
+ ret = btrfs_remove_chunk(trans, root, chunk_offset);
btrfs_end_transaction(trans, root);
- return 0;
+ return ret;
}
static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
@@ -2676,8 +2751,8 @@ again:
found_key.offset);
if (ret == -ENOSPC)
failed++;
- else if (ret)
- BUG();
+ else
+ BUG_ON(ret);
}
if (found_key.offset == 0)
@@ -3084,11 +3159,12 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
- old_size = device->total_bytes;
+ old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
if (!device->writeable ||
- device->total_bytes - device->bytes_used > size_to_free ||
+ btrfs_device_get_total_bytes(device) -
+ btrfs_device_get_bytes_used(device) > size_to_free ||
device->is_tgtdev_for_dev_replace)
continue;
@@ -3643,8 +3719,6 @@ static int btrfs_uuid_scan_kthread(void *data)
max_key.type = BTRFS_ROOT_ITEM_KEY;
max_key.offset = (u64)-1;
- path->keep_locks = 1;
-
while (1) {
ret = btrfs_search_forward(root, &key, path, 0);
if (ret) {
@@ -3896,8 +3970,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
struct btrfs_key key;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
- u64 old_size = device->total_bytes;
- u64 diff = device->total_bytes - new_size;
+ u64 old_size = btrfs_device_get_total_bytes(device);
+ u64 diff = old_size - new_size;
if (device->is_tgtdev_for_dev_replace)
return -EINVAL;
@@ -3910,7 +3984,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
lock_chunks(root);
- device->total_bytes = new_size;
+ btrfs_device_set_total_bytes(device, new_size);
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
spin_lock(&root->fs_info->free_chunk_lock);
@@ -3976,7 +4050,7 @@ again:
ret = -ENOSPC;
lock_chunks(root);
- device->total_bytes = old_size;
+ btrfs_device_set_total_bytes(device, old_size);
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
spin_lock(&root->fs_info->free_chunk_lock);
@@ -3994,18 +4068,17 @@ again:
}
lock_chunks(root);
+ btrfs_device_set_disk_total_bytes(device, new_size);
+ if (list_empty(&device->resized_list))
+ list_add_tail(&device->resized_list,
+ &root->fs_info->fs_devices->resized_devices);
- device->disk_total_bytes = new_size;
- /* Now btrfs_update_device() will change the on-disk size. */
- ret = btrfs_update_device(trans, device);
- if (ret) {
- unlock_chunks(root);
- btrfs_end_transaction(trans, root);
- goto done;
- }
WARN_ON(diff > old_total);
btrfs_set_super_total_bytes(super_copy, old_total - diff);
unlock_chunks(root);
+
+ /* Now btrfs_update_device() will change the on-disk size. */
+ ret = btrfs_update_device(trans, device);
btrfs_end_transaction(trans, root);
done:
btrfs_free_path(path);
@@ -4021,10 +4094,13 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
u32 array_size;
u8 *ptr;
+ lock_chunks(root);
array_size = btrfs_super_sys_array_size(super_copy);
if (array_size + item_size + sizeof(disk_key)
- > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ unlock_chunks(root);
return -EFBIG;
+ }
ptr = super_copy->sys_chunk_array + array_size;
btrfs_cpu_key_to_disk(&disk_key, key);
@@ -4033,6 +4109,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
memcpy(ptr, chunk, item_size);
item_size += sizeof(disk_key);
btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+ unlock_chunks(root);
+
return 0;
}
@@ -4402,6 +4480,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (ret)
goto error_del_extent;
+ for (i = 0; i < map->num_stripes; i++) {
+ num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
+ btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
+ }
+
+ spin_lock(&extent_root->fs_info->free_chunk_lock);
+ extent_root->fs_info->free_chunk_space -= (stripe_size *
+ map->num_stripes);
+ spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
free_extent_map(em);
check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -4473,7 +4561,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
- device->bytes_used += stripe_size;
ret = btrfs_update_device(trans, device);
if (ret)
goto out;
@@ -4486,11 +4573,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
goto out;
}
- spin_lock(&extent_root->fs_info->free_chunk_lock);
- extent_root->fs_info->free_chunk_space -= (stripe_size *
- map->num_stripes);
- spin_unlock(&extent_root->fs_info->free_chunk_lock);
-
stripe = &chunk->stripe;
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
@@ -4570,16 +4652,25 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
alloc_profile);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
+ return ret;
+}
+
+static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+{
+ int max_errors;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ max_errors = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+ max_errors = 2;
+ } else {
+ max_errors = 0;
}
- ret = btrfs_add_device(trans, fs_info->chunk_root, device);
- if (ret)
- btrfs_abort_transaction(trans, root, ret);
-out:
- return ret;
+ return max_errors;
}
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
@@ -4588,6 +4679,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
struct map_lookup *map;
struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
int readonly = 0;
+ int miss_ndevs = 0;
int i;
read_lock(&map_tree->map_tree.lock);
@@ -4596,18 +4688,27 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
if (!em)
return 1;
- if (btrfs_test_opt(root, DEGRADED)) {
- free_extent_map(em);
- return 0;
- }
-
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
+ if (map->stripes[i].dev->missing) {
+ miss_ndevs++;
+ continue;
+ }
+
if (!map->stripes[i].dev->writeable) {
readonly = 1;
- break;
+ goto end;
}
}
+
+ /*
+ * If the number of missing devices is larger than max errors,
+ * we can not write the data into that chunk successfully, so
+ * set it readonly.
+ */
+ if (miss_ndevs > btrfs_chunk_max_errors(map))
+ readonly = 1;
+end:
free_extent_map(em);
return readonly;
}
@@ -5008,6 +5109,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
num_stripes = min_t(u64, map->num_stripes,
stripe_nr_end - stripe_nr_orig);
stripe_index = do_div(stripe_nr, map->num_stripes);
+ if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
+ mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
num_stripes = map->num_stripes;
@@ -5111,6 +5214,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
/* We distribute the parity blocks across stripes */
tmp = stripe_nr + stripe_index;
stripe_index = do_div(tmp, map->num_stripes);
+ if (!(rw & (REQ_WRITE | REQ_DISCARD |
+ REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
+ mirror_num = 1;
}
} else {
/*
@@ -5218,16 +5324,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
}
- if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_DUP)) {
- max_errors = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
- max_errors = 2;
- }
- }
+ if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
+ max_errors = btrfs_chunk_max_errors(map);
if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
dev_replace->tgtdev != NULL) {
@@ -5610,8 +5708,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
name = rcu_dereference(dev->name);
pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
"(%s id %llu), size=%u\n", rw,
- (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
- name->str, dev->devid, bio->bi_size);
+ (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
+ name->str, dev->devid, bio->bi_iter.bi_size);
rcu_read_unlock();
}
#endif
@@ -5789,10 +5887,10 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
}
static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
if (IS_ERR(device))
@@ -5929,7 +6027,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
}
if (!map->stripes[i].dev) {
map->stripes[i].dev =
- add_missing_dev(root, devid, uuid);
+ add_missing_dev(root, root->fs_info->fs_devices,
+ devid, uuid);
if (!map->stripes[i].dev) {
free_extent_map(em);
return -EIO;
@@ -5956,7 +6055,9 @@ static void fill_device_from_item(struct extent_buffer *leaf,
device->devid = btrfs_device_id(leaf, dev_item);
device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
device->total_bytes = device->disk_total_bytes;
+ device->commit_total_bytes = device->disk_total_bytes;
device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+ device->commit_bytes_used = device->bytes_used;
device->type = btrfs_device_type(leaf, dev_item);
device->io_align = btrfs_device_io_align(leaf, dev_item);
device->io_width = btrfs_device_io_width(leaf, dev_item);
@@ -5968,7 +6069,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}
-static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
+static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
+ u8 *fsid)
{
struct btrfs_fs_devices *fs_devices;
int ret;
@@ -5977,49 +6079,56 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
fs_devices = root->fs_info->fs_devices->seed;
while (fs_devices) {
- if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
- ret = 0;
- goto out;
- }
+ if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
+ return fs_devices;
+
fs_devices = fs_devices->seed;
}
fs_devices = find_fsid(fsid);
if (!fs_devices) {
- ret = -ENOENT;
- goto out;
+ if (!btrfs_test_opt(root, DEGRADED))
+ return ERR_PTR(-ENOENT);
+
+ fs_devices = alloc_fs_devices(fsid);
+ if (IS_ERR(fs_devices))
+ return fs_devices;
+
+ fs_devices->seeding = 1;
+ fs_devices->opened = 1;
+ return fs_devices;
}
fs_devices = clone_fs_devices(fs_devices);
- if (IS_ERR(fs_devices)) {
- ret = PTR_ERR(fs_devices);
- goto out;
- }
+ if (IS_ERR(fs_devices))
+ return fs_devices;
ret = __btrfs_open_devices(fs_devices, FMODE_READ,
root->fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
+ fs_devices = ERR_PTR(ret);
goto out;
}
if (!fs_devices->seeding) {
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
- ret = -EINVAL;
+ fs_devices = ERR_PTR(-EINVAL);
goto out;
}
fs_devices->seed = root->fs_info->fs_devices->seed;
root->fs_info->fs_devices->seed = fs_devices;
out:
- return ret;
+ return fs_devices;
}
static int read_one_dev(struct btrfs_root *root,
struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_device *device;
u64 devid;
int ret;
@@ -6033,31 +6142,48 @@ static int read_one_dev(struct btrfs_root *root,
BTRFS_UUID_SIZE);
if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
- ret = open_seed_devices(root, fs_uuid);
- if (ret && !btrfs_test_opt(root, DEGRADED))
- return ret;
+ fs_devices = open_seed_devices(root, fs_uuid);
+ if (IS_ERR(fs_devices))
+ return PTR_ERR(fs_devices);
}
device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
- if (!device || !device->bdev) {
+ if (!device) {
if (!btrfs_test_opt(root, DEGRADED))
return -EIO;
- if (!device) {
- btrfs_warn(root->fs_info, "devid %llu missing", devid);
- device = add_missing_dev(root, devid, dev_uuid);
- if (!device)
- return -ENOMEM;
- } else if (!device->missing) {
+ btrfs_warn(root->fs_info, "devid %llu missing", devid);
+ device = add_missing_dev(root, fs_devices, devid, dev_uuid);
+ if (!device)
+ return -ENOMEM;
+ } else {
+ if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
+ return -EIO;
+
+ if(!device->bdev && !device->missing) {
/*
* this happens when a device that was properly setup
* in the device info lists suddenly goes bad.
* device->bdev is NULL, and so we have to set
* device->missing to one here
*/
- root->fs_info->fs_devices->missing_devices++;
+ device->fs_devices->missing_devices++;
device->missing = 1;
}
+
+ /* Move the device to its own fs_devices */
+ if (device->fs_devices != fs_devices) {
+ ASSERT(device->missing);
+
+ list_move(&device->dev_list, &fs_devices->devices);
+ device->fs_devices->num_devices--;
+ fs_devices->num_devices++;
+
+ device->fs_devices->missing_devices--;
+ fs_devices->missing_devices++;
+
+ device->fs_devices = fs_devices;
+ }
}
if (device->fs_devices != root->fs_info->fs_devices) {
@@ -6373,16 +6499,18 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ int stats_cnt;
int ret = 0;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (!device->dev_stats_valid || !device->dev_stats_dirty)
+ if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
continue;
+ stats_cnt = atomic_read(&device->dev_stats_ccnt);
ret = update_dev_stat_item(trans, dev_root, device);
if (!ret)
- device->dev_stats_dirty = 0;
+ atomic_sub(stats_cnt, &device->dev_stats_ccnt);
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -6481,3 +6609,51 @@ int btrfs_scratch_superblock(struct btrfs_device *device)
return 0;
}
+
+/*
+ * Update the size of all devices, which is used for writing out the
+ * super blocks.
+ */
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *curr, *next;
+
+ if (list_empty(&fs_devices->resized_devices))
+ return;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ lock_chunks(fs_info->dev_root);
+ list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
+ resized_list) {
+ list_del_init(&curr->resized_list);
+ curr->commit_total_bytes = curr->disk_total_bytes;
+ }
+ unlock_chunks(fs_info->dev_root);
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
+
+/* Must be invoked during the transaction commit */
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+ struct btrfs_transaction *transaction)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_device *dev;
+ int i;
+
+ if (list_empty(&transaction->pending_chunks))
+ return;
+
+ /* In order to kick the device replace finish process */
+ lock_chunks(root);
+ list_for_each_entry(em, &transaction->pending_chunks, list) {
+ map = (struct map_lookup *)em->bdev;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ dev = map->stripes[i].dev;
+ dev->commit_bytes_used = dev->bytes_used;
+ }
+ }
+ unlock_chunks(root);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2aaa00c47816..08980fa23039 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@
#include <linux/btrfs.h>
#include "async-thread.h"
+extern struct mutex uuid_mutex;
+
#define BTRFS_STRIPE_LEN (64 * 1024)
struct buffer_head;
@@ -32,41 +34,59 @@ struct btrfs_pending_bios {
struct bio *tail;
};
+/*
+ * Use sequence counter to get consistent device stat data on
+ * 32-bit processors.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#include <linux/seqlock.h>
+#define __BTRFS_NEED_DEVICE_DATA_ORDERED
+#define btrfs_device_data_ordered_init(device) \
+ seqcount_init(&device->data_seqcount)
+#else
+#define btrfs_device_data_ordered_init(device) do { } while (0)
+#endif
+
struct btrfs_device {
struct list_head dev_list;
struct list_head dev_alloc_list;
struct btrfs_fs_devices *fs_devices;
+
struct btrfs_root *dev_root;
+ struct rcu_string *name;
+
+ u64 generation;
+
+ spinlock_t io_lock ____cacheline_aligned;
+ int running_pending;
/* regular prio bios */
struct btrfs_pending_bios pending_bios;
/* WRITE_SYNC bios */
struct btrfs_pending_bios pending_sync_bios;
- u64 generation;
- int running_pending;
+ struct block_device *bdev;
+
+ /* the mode sent to blkdev_get */
+ fmode_t mode;
+
int writeable;
int in_fs_metadata;
int missing;
int can_discard;
int is_tgtdev_for_dev_replace;
- spinlock_t io_lock;
- /* the mode sent to blkdev_get */
- fmode_t mode;
-
- struct block_device *bdev;
-
-
- struct rcu_string *name;
+#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
+ seqcount_t data_seqcount;
+#endif
/* the internal btrfs device id */
u64 devid;
- /* size of the device */
+ /* size of the device in memory */
u64 total_bytes;
- /* size of the disk */
+ /* size of the device on disk */
u64 disk_total_bytes;
/* bytes used */
@@ -83,10 +103,26 @@ struct btrfs_device {
/* minimal io size for this device */
u32 sector_size;
-
/* physical drive uuid (or lvm uuid) */
u8 uuid[BTRFS_UUID_SIZE];
+ /*
+ * size of the device on the current transaction
+ *
+ * This variant is update when committing the transaction,
+ * and protected by device_list_mutex
+ */
+ u64 commit_total_bytes;
+
+ /* bytes used on the current transaction */
+ u64 commit_bytes_used;
+ /*
+ * used to manage the device which is resized
+ *
+ * It is protected by chunk_lock.
+ */
+ struct list_head resized_list;
+
/* for sending down flush barriers */
int nobarriers;
struct bio *flush_bio;
@@ -107,26 +143,90 @@ struct btrfs_device {
struct radix_tree_root reada_zones;
struct radix_tree_root reada_extents;
-
/* disk I/O failure stats. For detailed description refer to
* enum btrfs_dev_stat_values in ioctl.h */
int dev_stats_valid;
- int dev_stats_dirty; /* counters need to be written to disk */
+
+ /* Counter to record the change of device stats */
+ atomic_t dev_stats_ccnt;
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
};
+/*
+ * If we read those variants at the context of their own lock, we needn't
+ * use the following helpers, reading them directly is safe.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ unsigned int seq; \
+ \
+ do { \
+ seq = read_seqcount_begin(&dev->data_seqcount); \
+ size = dev->name; \
+ } while (read_seqcount_retry(&dev->data_seqcount, seq)); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ write_seqcount_begin(&dev->data_seqcount); \
+ dev->name = size; \
+ write_seqcount_end(&dev->data_seqcount); \
+ preempt_enable(); \
+}
+#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ \
+ preempt_disable(); \
+ size = dev->name; \
+ preempt_enable(); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ dev->name = size; \
+ preempt_enable(); \
+}
+#else
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ return dev->name; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ dev->name = size; \
+}
+#endif
+
+BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
- /* the device with this id has the most recent copy of the super */
- u64 latest_devid;
- u64 latest_trans;
u64 num_devices;
u64 open_devices;
u64 rw_devices;
u64 missing_devices;
u64 total_rw_bytes;
- u64 num_can_discard;
u64 total_devices;
struct block_device *latest_bdev;
@@ -139,6 +239,7 @@ struct btrfs_fs_devices {
struct mutex device_list_mutex;
struct list_head devices;
+ struct list_head resized_devices;
/* devices not currently being allocated */
struct list_head alloc_list;
struct list_head list;
@@ -167,8 +268,9 @@ struct btrfs_fs_devices {
*/
typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
struct btrfs_io_bio {
- unsigned long mirror_num;
- unsigned long stripe_index;
+ unsigned int mirror_num;
+ unsigned int stripe_index;
+ u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
u8 *csum_allocated;
@@ -325,6 +427,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device *srcdev,
struct btrfs_device **device_out);
int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs);
@@ -360,11 +463,20 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
u64 chunk_offset, u64 chunk_size);
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 chunk_offset);
+
+static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
+{
+ return atomic_read(&dev->dev_stats_ccnt);
+}
+
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
{
atomic_inc(dev->dev_stat_values + index);
- dev->dev_stats_dirty = 1;
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
}
static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
@@ -379,7 +491,8 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
int ret;
ret = atomic_xchg(dev->dev_stat_values + index, 0);
- dev->dev_stats_dirty = 1;
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
return ret;
}
@@ -387,7 +500,8 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
int index, unsigned long val)
{
atomic_set(dev->dev_stat_values + index, val);
- dev->dev_stats_dirty = 1;
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
}
static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
@@ -395,4 +509,8 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
{
btrfs_dev_stat_set(dev, index, 0);
}
+
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+ struct btrfs_transaction *transaction);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ad8328d797ea..dcf20131fbe4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -237,7 +237,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
* first xattr that we find and walk forward
*/
key.objectid = btrfs_ino(inode);
- btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
path = btrfs_alloc_path();
@@ -273,7 +273,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
- if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
+ if (found_key.type != BTRFS_XATTR_ITEM_KEY)
break;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b67d8fc81277..759fa4e2de8f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -33,8 +33,7 @@
#include "compression.h"
struct workspace {
- z_stream inf_strm;
- z_stream def_strm;
+ z_stream strm;
char *buf;
struct list_head list;
};
@@ -43,8 +42,7 @@ static void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- vfree(workspace->def_strm.workspace);
- vfree(workspace->inf_strm.workspace);
+ vfree(workspace->strm.workspace);
kfree(workspace->buf);
kfree(workspace);
}
@@ -52,17 +50,17 @@ static void zlib_free_workspace(struct list_head *ws)
static struct list_head *zlib_alloc_workspace(void)
{
struct workspace *workspace;
+ int workspacesize;
workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
- MAX_WBITS, MAX_MEM_LEVEL));
- workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+ workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ zlib_inflate_workspacesize());
+ workspace->strm.workspace = vmalloc(workspacesize);
workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
- if (!workspace->def_strm.workspace ||
- !workspace->inf_strm.workspace || !workspace->buf)
+ if (!workspace->strm.workspace || !workspace->buf)
goto fail;
INIT_LIST_HEAD(&workspace->list);
@@ -96,14 +94,14 @@ static int zlib_compress_pages(struct list_head *ws,
*total_out = 0;
*total_in = 0;
- if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+ if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) {
printk(KERN_WARNING "BTRFS: deflateInit failed\n");
ret = -EIO;
goto out;
}
- workspace->def_strm.total_in = 0;
- workspace->def_strm.total_out = 0;
+ workspace->strm.total_in = 0;
+ workspace->strm.total_out = 0;
in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
data_in = kmap(in_page);
@@ -117,25 +115,25 @@ static int zlib_compress_pages(struct list_head *ws,
pages[0] = out_page;
nr_pages = 1;
- workspace->def_strm.next_in = data_in;
- workspace->def_strm.next_out = cpage_out;
- workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
- workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+ workspace->strm.next_in = data_in;
+ workspace->strm.next_out = cpage_out;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
- while (workspace->def_strm.total_in < len) {
- ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+ while (workspace->strm.total_in < len) {
+ ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
if (ret != Z_OK) {
printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
ret);
- zlib_deflateEnd(&workspace->def_strm);
+ zlib_deflateEnd(&workspace->strm);
ret = -EIO;
goto out;
}
/* we're making it bigger, give up */
- if (workspace->def_strm.total_in > 8192 &&
- workspace->def_strm.total_in <
- workspace->def_strm.total_out) {
+ if (workspace->strm.total_in > 8192 &&
+ workspace->strm.total_in <
+ workspace->strm.total_out) {
ret = -E2BIG;
goto out;
}
@@ -143,7 +141,7 @@ static int zlib_compress_pages(struct list_head *ws,
* before the total_in so we will pull in a new page for
* the stream end if required
*/
- if (workspace->def_strm.avail_out == 0) {
+ if (workspace->strm.avail_out == 0) {
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
@@ -158,19 +156,19 @@ static int zlib_compress_pages(struct list_head *ws,
cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
- workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
- workspace->def_strm.next_out = cpage_out;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.next_out = cpage_out;
}
/* we're all done */
- if (workspace->def_strm.total_in >= len)
+ if (workspace->strm.total_in >= len)
break;
/* we've read in a full page, get a new one */
- if (workspace->def_strm.avail_in == 0) {
- if (workspace->def_strm.total_out > max_out)
+ if (workspace->strm.avail_in == 0) {
+ if (workspace->strm.total_out > max_out)
break;
- bytes_left = len - workspace->def_strm.total_in;
+ bytes_left = len - workspace->strm.total_in;
kunmap(in_page);
page_cache_release(in_page);
@@ -178,28 +176,28 @@ static int zlib_compress_pages(struct list_head *ws,
in_page = find_get_page(mapping,
start >> PAGE_CACHE_SHIFT);
data_in = kmap(in_page);
- workspace->def_strm.avail_in = min(bytes_left,
+ workspace->strm.avail_in = min(bytes_left,
PAGE_CACHE_SIZE);
- workspace->def_strm.next_in = data_in;
+ workspace->strm.next_in = data_in;
}
}
- workspace->def_strm.avail_in = 0;
- ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
- zlib_deflateEnd(&workspace->def_strm);
+ workspace->strm.avail_in = 0;
+ ret = zlib_deflate(&workspace->strm, Z_FINISH);
+ zlib_deflateEnd(&workspace->strm);
if (ret != Z_STREAM_END) {
ret = -EIO;
goto out;
}
- if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+ if (workspace->strm.total_out >= workspace->strm.total_in) {
ret = -E2BIG;
goto out;
}
ret = 0;
- *total_out = workspace->def_strm.total_out;
- *total_in = workspace->def_strm.total_in;
+ *total_out = workspace->strm.total_out;
+ *total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
if (out_page)
@@ -225,19 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
size_t total_out = 0;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
- unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE;
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
unsigned long buf_start;
unsigned long pg_offset;
data_in = kmap(pages_in[page_in_index]);
- workspace->inf_strm.next_in = data_in;
- workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
- workspace->inf_strm.total_in = 0;
+ workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
+ workspace->strm.total_in = 0;
- workspace->inf_strm.total_out = 0;
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.total_out = 0;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
pg_offset = 0;
/* If it's deflate, and it's got no preset dictionary, then
@@ -247,21 +244,21 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
!(((data_in[0]<<8) + data_in[1]) % 31)) {
wbits = -((data_in[0] >> 4) + 8);
- workspace->inf_strm.next_in += 2;
- workspace->inf_strm.avail_in -= 2;
+ workspace->strm.next_in += 2;
+ workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
printk(KERN_WARNING "BTRFS: inflateInit failed\n");
return -EIO;
}
- while (workspace->inf_strm.total_in < srclen) {
- ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ while (workspace->strm.total_in < srclen) {
+ ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
if (ret != Z_OK && ret != Z_STREAM_END)
break;
buf_start = total_out;
- total_out = workspace->inf_strm.total_out;
+ total_out = workspace->strm.total_out;
/* we didn't make progress in this inflate call, we're done */
if (buf_start == total_out)
@@ -276,10 +273,10 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
goto done;
}
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
- if (workspace->inf_strm.avail_in == 0) {
+ if (workspace->strm.avail_in == 0) {
unsigned long tmp;
kunmap(pages_in[page_in_index]);
page_in_index++;
@@ -288,9 +285,9 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
break;
}
data_in = kmap(pages_in[page_in_index]);
- workspace->inf_strm.next_in = data_in;
- tmp = srclen - workspace->inf_strm.total_in;
- workspace->inf_strm.avail_in = min(tmp,
+ workspace->strm.next_in = data_in;
+ tmp = srclen - workspace->strm.total_in;
+ workspace->strm.avail_in = min(tmp,
PAGE_CACHE_SIZE);
}
}
@@ -299,7 +296,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
else
ret = 0;
done:
- zlib_inflateEnd(&workspace->inf_strm);
+ zlib_inflateEnd(&workspace->strm);
if (data_in)
kunmap(pages_in[page_in_index]);
return ret;
@@ -317,13 +314,13 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
unsigned long total_out = 0;
char *kaddr;
- workspace->inf_strm.next_in = data_in;
- workspace->inf_strm.avail_in = srclen;
- workspace->inf_strm.total_in = 0;
+ workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = srclen;
+ workspace->strm.total_in = 0;
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
- workspace->inf_strm.total_out = 0;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.total_out = 0;
/* If it's deflate, and it's got no preset dictionary, then
we can tell zlib to skip the adler32 check. */
if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
@@ -331,11 +328,11 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
!(((data_in[0]<<8) + data_in[1]) % 31)) {
wbits = -((data_in[0] >> 4) + 8);
- workspace->inf_strm.next_in += 2;
- workspace->inf_strm.avail_in -= 2;
+ workspace->strm.next_in += 2;
+ workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
printk(KERN_WARNING "BTRFS: inflateInit failed\n");
return -EIO;
}
@@ -346,12 +343,12 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
unsigned long bytes;
unsigned long pg_offset = 0;
- ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
if (ret != Z_OK && ret != Z_STREAM_END)
break;
buf_start = total_out;
- total_out = workspace->inf_strm.total_out;
+ total_out = workspace->strm.total_out;
if (total_out == buf_start) {
ret = -EIO;
@@ -377,8 +374,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
pg_offset += bytes;
bytes_left -= bytes;
next:
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
}
if (ret != Z_STREAM_END && bytes_left != 0)
@@ -386,7 +383,7 @@ next:
else
ret = 0;
- zlib_inflateEnd(&workspace->inf_strm);
+ zlib_inflateEnd(&workspace->strm);
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 3588a80854b2..9614adc7e754 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1253,7 +1253,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
* a local interrupt disable for that.
*/
-#define BH_LRU_SIZE 8
+#define BH_LRU_SIZE 16
struct bh_lru {
struct buffer_head *bhs[BH_LRU_SIZE];
@@ -1331,8 +1331,8 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
for (i = 0; i < BH_LRU_SIZE; i++) {
struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
- if (bh && bh->b_bdev == bdev &&
- bh->b_blocknr == block && bh->b_size == size) {
+ if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
+ bh->b_size == size) {
if (i) {
while (i) {
__this_cpu_write(bh_lrus.bhs[i],
@@ -2318,6 +2318,11 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
err = 0;
balance_dirty_pages_ratelimited(mapping);
+
+ if (unlikely(fatal_signal_pending(current))) {
+ err = -EINTR;
+ goto out;
+ }
}
/* page covers the boundary, find the boundary offset */
@@ -2956,7 +2961,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
/*
* This allows us to do IO even on the odd last sectors
- * of a device, even if the bh block size is some multiple
+ * of a device, even if the block size is some multiple
* of the physical sector size.
*
* We'll just truncate the bio to the size of the device,
@@ -2966,10 +2971,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
* errors, this only handles the "we need to be able to
* do IO at the final sector" case.
*/
-static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+void guard_bio_eod(int rw, struct bio *bio)
{
sector_t maxsector;
- unsigned bytes;
+ struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ unsigned truncated_bytes;
maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
if (!maxsector)
@@ -2984,23 +2990,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
return;
maxsector -= bio->bi_iter.bi_sector;
- bytes = bio->bi_iter.bi_size;
- if (likely((bytes >> 9) <= maxsector))
+ if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
return;
- /* Uhhuh. We've got a bh that straddles the device size! */
- bytes = maxsector << 9;
+ /* Uhhuh. We've got a bio that straddles the device size! */
+ truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
/* Truncate the bio.. */
- bio->bi_iter.bi_size = bytes;
- bio->bi_io_vec[0].bv_len = bytes;
+ bio->bi_iter.bi_size -= truncated_bytes;
+ bvec->bv_len -= truncated_bytes;
/* ..and clear the end of the buffer for reads */
if ((rw & RW_MASK) == READ) {
- void *kaddr = kmap_atomic(bh->b_page);
- memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
- kunmap_atomic(kaddr);
- flush_dcache_page(bh->b_page);
+ zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+ truncated_bytes);
}
}
@@ -3041,7 +3044,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
bio->bi_flags |= bio_flags;
/* Take care of bh's that straddle the end of the device */
- guard_bh_eod(rw, bio, bh);
+ guard_bio_eod(rw, bio);
if (buffer_meta(bh))
rw |= REQ_META;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 25e745b8eb1b..616db0e77b44 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -880,7 +880,6 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
- mm_segment_t old_fs;
struct file *file;
struct path path;
loff_t pos, eof;
@@ -914,36 +913,27 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
if (IS_ERR(file)) {
ret = PTR_ERR(file);
} else {
- ret = -EIO;
- if (file->f_op->write) {
- pos = (loff_t) page->index << PAGE_SHIFT;
-
- /* we mustn't write more data than we have, so we have
- * to beware of a partial page at EOF */
- eof = object->fscache.store_limit_l;
- len = PAGE_SIZE;
- if (eof & ~PAGE_MASK) {
- ASSERTCMP(pos, <, eof);
- if (eof - pos < PAGE_SIZE) {
- _debug("cut short %llx to %llx",
- pos, eof);
- len = eof - pos;
- ASSERTCMP(pos + len, ==, eof);
- }
+ pos = (loff_t) page->index << PAGE_SHIFT;
+
+ /* we mustn't write more data than we have, so we have
+ * to beware of a partial page at EOF */
+ eof = object->fscache.store_limit_l;
+ len = PAGE_SIZE;
+ if (eof & ~PAGE_MASK) {
+ ASSERTCMP(pos, <, eof);
+ if (eof - pos < PAGE_SIZE) {
+ _debug("cut short %llx to %llx",
+ pos, eof);
+ len = eof - pos;
+ ASSERTCMP(pos + len, ==, eof);
}
-
- data = kmap(page);
- file_start_write(file);
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = file->f_op->write(
- file, (const void __user *) data, len, &pos);
- set_fs(old_fs);
- kunmap(page);
- file_end_write(file);
- if (ret != len)
- ret = -EIO;
}
+
+ data = kmap(page);
+ ret = __kernel_write(file, data, len, &pos);
+ kunmap(page);
+ if (ret != len)
+ ret = -EIO;
fput(file);
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c29d6ae68874..b6c59eaa4f64 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1069,7 +1069,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
ceph_dentry_lru_touch(dentry);
} else {
ceph_dir_clear_complete(dir);
- d_drop(dentry);
}
iput(dir);
return valid;
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 58df174deb10..b8602f199815 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -195,15 +195,15 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
else
noff = tkn_e - (sb_mountdata + off) + 1;
- if (strnicmp(sb_mountdata + off, "unc=", 4) == 0) {
+ if (strncasecmp(sb_mountdata + off, "unc=", 4) == 0) {
off += noff;
continue;
}
- if (strnicmp(sb_mountdata + off, "ip=", 3) == 0) {
+ if (strncasecmp(sb_mountdata + off, "ip=", 3) == 0) {
off += noff;
continue;
}
- if (strnicmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
+ if (strncasecmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
off += noff;
continue;
}
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index a3e932547617..f4cf200b3c76 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -62,7 +62,6 @@ cifs_spnego_key_destroy(struct key *key)
struct key_type cifs_spnego_key_type = {
.name = "cifs.spnego",
.instantiate = cifs_spnego_key_instantiate,
- .match = user_match,
.destroy = cifs_spnego_key_destroy,
.describe = user_describe,
};
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7ff866dbb89e..6d00c419cbae 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -84,7 +84,6 @@ static struct key_type cifs_idmap_key_type = {
.instantiate = cifs_idmap_key_instantiate,
.destroy = cifs_idmap_key_destroy,
.describe = user_describe,
- .match = user_match,
};
static char *
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 889b98455750..9d7996e8e793 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -813,7 +813,8 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
}
-static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
+static int
+cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv)
{
/*
* Note that this is called by vfs setlease with i_lock held to
@@ -829,7 +830,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
if (arg == F_UNLCK ||
((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode))))
- return generic_setlease(file, arg, lease);
+ return generic_setlease(file, arg, lease, priv);
else if (tlink_tcon(cfile->tlink)->local_lease &&
!CIFS_CACHE_READ(CIFS_I(inode)))
/*
@@ -840,7 +841,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
* knows that the file won't be changed on the server by anyone
* else.
*/
- return generic_setlease(file, arg, lease);
+ return generic_setlease(file, arg, lease, priv);
else
return -EAGAIN;
}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 36ca2045009b..239e1fb33000 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1718,7 +1718,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto cifs_parse_mount_err;
}
- if (strnicmp(string, "default", 7) != 0) {
+ if (strncasecmp(string, "default", 7) != 0) {
vol->iocharset = kstrdup(string,
GFP_KERNEL);
if (!vol->iocharset) {
@@ -1790,7 +1790,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
if (string == NULL)
goto out_nomem;
- if (strnicmp(string, "1", 1) == 0) {
+ if (strncasecmp(string, "1", 1) == 0) {
/* This is the default */
break;
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6cbd9c688cfe..073640675a39 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -461,8 +461,8 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
xid = get_xid();
- cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n",
- inode, direntry->d_name.name, direntry);
+ cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
+ inode, direntry, direntry);
tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
if (IS_ERR(tlink)) {
@@ -540,8 +540,8 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
struct cifs_fid fid;
__u32 oplock;
- cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n",
- inode, direntry->d_name.name, direntry);
+ cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
+ inode, direntry, direntry);
tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
rc = PTR_ERR(tlink);
@@ -713,8 +713,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
xid = get_xid();
- cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n",
- parent_dir_inode, direntry->d_name.name, direntry);
+ cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
+ parent_dir_inode, direntry, direntry);
/* check whether path exists */
@@ -833,7 +833,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
int rc = 0;
- cifs_dbg(FYI, "In cifs d_delete, name = %s\n", direntry->d_name.name);
+ cifs_dbg(FYI, "In cifs d_delete, name = %pd\n", direntry);
return rc;
} */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5f29354b072a..8f7b40fd8f3b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1650,8 +1650,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
cifs_sb = CIFS_SB(dentry->d_sb);
- cifs_dbg(FYI, "write %zd bytes to offset %lld of %s\n",
- write_size, *offset, dentry->d_name.name);
+ cifs_dbg(FYI, "write %zd bytes to offset %lld of %pd\n",
+ write_size, *offset, dentry);
tcon = tlink_tcon(open_file->tlink);
server = tcon->ses->server;
@@ -2273,8 +2273,8 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
xid = get_xid();
- cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
- file->f_path.dentry->d_name.name, datasync);
+ cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n",
+ file, datasync);
if (!CIFS_CACHE_READ(CIFS_I(inode))) {
rc = cifs_zap_mapping(inode);
@@ -2315,8 +2315,8 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
xid = get_xid();
- cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
- file->f_path.dentry->d_name.name, datasync);
+ cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n",
+ file, datasync);
tcon = tlink_tcon(smbfile->tlink);
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7899a40465b3..8fd4ee8e07ff 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1419,8 +1419,8 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
d_instantiate(dentry, newinode);
#ifdef CONFIG_CIFS_DEBUG2
- cifs_dbg(FYI, "instantiated dentry %p %s to inode %p\n",
- dentry, dentry->d_name.name, newinode);
+ cifs_dbg(FYI, "instantiated dentry %p %pd to inode %p\n",
+ dentry, dentry, newinode);
if (newinode->i_nlink != 2)
cifs_dbg(FYI, "unexpected number of links %d\n",
@@ -2111,8 +2111,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
struct cifs_unix_set_info_args *args = NULL;
struct cifsFileInfo *open_file;
- cifs_dbg(FYI, "setattr_unix on file %s attrs->ia_valid=0x%x\n",
- direntry->d_name.name, attrs->ia_valid);
+ cifs_dbg(FYI, "setattr_unix on file %pd attrs->ia_valid=0x%x\n",
+ direntry, attrs->ia_valid);
xid = get_xid();
@@ -2254,8 +2254,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
xid = get_xid();
- cifs_dbg(FYI, "setattr on file %s attrs->iavalid 0x%x\n",
- direntry->d_name.name, attrs->ia_valid);
+ cifs_dbg(FYI, "setattr on file %pd attrs->iavalid 0x%x\n",
+ direntry, attrs->ia_valid);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
attrs->ia_valid |= ATTR_FORCE;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b334a89d6a66..d2141f101382 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -87,8 +87,6 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
return;
if (dentry) {
- int err;
-
inode = dentry->d_inode;
if (inode) {
/*
@@ -105,10 +103,8 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
goto out;
}
}
- err = d_invalidate(dentry);
+ d_invalidate(dentry);
dput(dentry);
- if (err)
- return;
}
/*
diff --git a/fs/compat.c b/fs/compat.c
index 66d3d3c6b4b2..b13df99f3534 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -794,25 +794,21 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
char *kernel_type;
unsigned long data_page;
char *kernel_dev;
- struct filename *dir;
int retval;
- retval = copy_mount_string(type, &kernel_type);
- if (retval < 0)
+ kernel_type = copy_mount_string(type);
+ retval = PTR_ERR(kernel_type);
+ if (IS_ERR(kernel_type))
goto out;
- dir = getname(dir_name);
- retval = PTR_ERR(dir);
- if (IS_ERR(dir))
+ kernel_dev = copy_mount_string(dev_name);
+ retval = PTR_ERR(kernel_dev);
+ if (IS_ERR(kernel_dev))
goto out1;
- retval = copy_mount_string(dev_name, &kernel_dev);
- if (retval < 0)
- goto out2;
-
retval = copy_mount_options(data, &data_page);
if (retval < 0)
- goto out3;
+ goto out2;
retval = -EINVAL;
@@ -821,19 +817,17 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
do_ncp_super_data_conv((void *)data_page);
} else if (!strcmp(kernel_type, NFS4_NAME)) {
if (do_nfs4_super_data_conv((void *) data_page))
- goto out4;
+ goto out3;
}
}
- retval = do_mount(kernel_dev, dir->name, kernel_type,
+ retval = do_mount(kernel_dev, dir_name, kernel_type,
flags, (void*)data_page);
- out4:
- free_page(data_page);
out3:
- kfree(kernel_dev);
+ free_page(data_page);
out2:
- putname(dir);
+ kfree(kernel_dev);
out1:
kfree(kernel_type);
out:
diff --git a/fs/coredump.c b/fs/coredump.c
index a93f7e6ea4cf..b5c86ffd5033 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -199,6 +199,14 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
err = cn_printf(cn, "%d",
task_tgid_nr(current));
break;
+ case 'i':
+ err = cn_printf(cn, "%d",
+ task_pid_vnr(current));
+ break;
+ case 'I':
+ err = cn_printf(cn, "%d",
+ task_pid_nr(current));
+ break;
/* uid */
case 'u':
err = cn_printf(cn, "%d", cred->uid);
diff --git a/fs/dcache.c b/fs/dcache.c
index cb25a1a5e307..d5a23fd0da90 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -235,18 +235,49 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
return dentry_string_cmp(cs, ct, tcount);
}
+struct external_name {
+ union {
+ atomic_t count;
+ struct rcu_head head;
+ } u;
+ unsigned char name[];
+};
+
+static inline struct external_name *external_name(struct dentry *dentry)
+{
+ return container_of(dentry->d_name.name, struct external_name, name[0]);
+}
+
static void __d_free(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
WARN_ON(!hlist_unhashed(&dentry->d_alias));
- if (dname_external(dentry))
- kfree(dentry->d_name.name);
kmem_cache_free(dentry_cache, dentry);
}
+static void __d_free_external(struct rcu_head *head)
+{
+ struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+ WARN_ON(!hlist_unhashed(&dentry->d_alias));
+ kfree(external_name(dentry));
+ kmem_cache_free(dentry_cache, dentry);
+}
+
+static inline int dname_external(const struct dentry *dentry)
+{
+ return dentry->d_name.name != dentry->d_iname;
+}
+
static void dentry_free(struct dentry *dentry)
{
+ if (unlikely(dname_external(dentry))) {
+ struct external_name *p = external_name(dentry);
+ if (likely(atomic_dec_and_test(&p->u.count))) {
+ call_rcu(&dentry->d_u.d_rcu, __d_free_external);
+ return;
+ }
+ }
/* if dentry was never visible to RCU, immediate free is OK */
if (!(dentry->d_flags & DCACHE_RCUACCESS))
__d_free(&dentry->d_u.d_rcu);
@@ -456,7 +487,7 @@ static void __dentry_kill(struct dentry *dentry)
* inform the fs via d_prune that this dentry is about to be
* unhashed and destroyed.
*/
- if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
+ if (dentry->d_flags & DCACHE_OP_PRUNE)
dentry->d_op->d_prune(dentry);
if (dentry->d_flags & DCACHE_LRU_LIST) {
@@ -619,62 +650,6 @@ kill_it:
}
EXPORT_SYMBOL(dput);
-/**
- * d_invalidate - invalidate a dentry
- * @dentry: dentry to invalidate
- *
- * Try to invalidate the dentry if it turns out to be
- * possible. If there are other dentries that can be
- * reached through this one we can't delete it and we
- * return -EBUSY. On success we return 0.
- *
- * no dcache lock.
- */
-
-int d_invalidate(struct dentry * dentry)
-{
- /*
- * If it's already been dropped, return OK.
- */
- spin_lock(&dentry->d_lock);
- if (d_unhashed(dentry)) {
- spin_unlock(&dentry->d_lock);
- return 0;
- }
- /*
- * Check whether to do a partial shrink_dcache
- * to get rid of unused child entries.
- */
- if (!list_empty(&dentry->d_subdirs)) {
- spin_unlock(&dentry->d_lock);
- shrink_dcache_parent(dentry);
- spin_lock(&dentry->d_lock);
- }
-
- /*
- * Somebody else still using it?
- *
- * If it's a directory, we can't drop it
- * for fear of somebody re-populating it
- * with children (even though dropping it
- * would make it unreachable from the root,
- * we might still populate it if it was a
- * working directory or similar).
- * We also need to leave mountpoints alone,
- * directory or not.
- */
- if (dentry->d_lockref.count > 1 && dentry->d_inode) {
- if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
- spin_unlock(&dentry->d_lock);
- return -EBUSY;
- }
- }
-
- __d_drop(dentry);
- spin_unlock(&dentry->d_lock);
- return 0;
-}
-EXPORT_SYMBOL(d_invalidate);
/* This must be called with d_lock held */
static inline void __dget_dlock(struct dentry *dentry)
@@ -735,7 +710,8 @@ EXPORT_SYMBOL(dget_parent);
* acquire the reference to alias and return it. Otherwise return NULL.
* Notice that if inode is a directory there can be only one alias and
* it can be unhashed only if it has no children, or if it is the root
- * of a filesystem.
+ * of a filesystem, or if the directory was renamed and d_revalidate
+ * was the first vfs operation to notice.
*
* If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
* any other hashed alias over that one.
@@ -799,20 +775,13 @@ restart:
hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
spin_lock(&dentry->d_lock);
if (!dentry->d_lockref.count) {
- /*
- * inform the fs via d_prune that this dentry
- * is about to be unhashed and destroyed.
- */
- if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
- !d_unhashed(dentry))
- dentry->d_op->d_prune(dentry);
-
- __dget_dlock(dentry);
- __d_drop(dentry);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&inode->i_lock);
- dput(dentry);
- goto restart;
+ struct dentry *parent = lock_parent(dentry);
+ if (likely(!dentry->d_lockref.count)) {
+ __dentry_kill(dentry);
+ goto restart;
+ }
+ if (parent)
+ spin_unlock(&parent->d_lock);
}
spin_unlock(&dentry->d_lock);
}
@@ -1193,7 +1162,7 @@ EXPORT_SYMBOL(have_submounts);
* reachable (e.g. NFS can unhash a directory dentry and then the complete
* subtree can become unreachable).
*
- * Only one of check_submounts_and_drop() and d_set_mounted() must succeed. For
+ * Only one of d_invalidate() and d_set_mounted() must succeed. For
* this reason take rename_lock and d_lock on dentry and ancestors.
*/
int d_set_mounted(struct dentry *dentry)
@@ -1202,7 +1171,7 @@ int d_set_mounted(struct dentry *dentry)
int ret = -ENOENT;
write_seqlock(&rename_lock);
for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
- /* Need exclusion wrt. check_submounts_and_drop() */
+ /* Need exclusion wrt. d_invalidate() */
spin_lock(&p->d_lock);
if (unlikely(d_unhashed(p))) {
spin_unlock(&p->d_lock);
@@ -1346,70 +1315,84 @@ void shrink_dcache_for_umount(struct super_block *sb)
}
}
-static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
+struct detach_data {
+ struct select_data select;
+ struct dentry *mountpoint;
+};
+static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry)
{
- struct select_data *data = _data;
+ struct detach_data *data = _data;
if (d_mountpoint(dentry)) {
- data->found = -EBUSY;
+ __dget_dlock(dentry);
+ data->mountpoint = dentry;
return D_WALK_QUIT;
}
- return select_collect(_data, dentry);
+ return select_collect(&data->select, dentry);
}
static void check_and_drop(void *_data)
{
- struct select_data *data = _data;
+ struct detach_data *data = _data;
- if (d_mountpoint(data->start))
- data->found = -EBUSY;
- if (!data->found)
- __d_drop(data->start);
+ if (!data->mountpoint && !data->select.found)
+ __d_drop(data->select.start);
}
/**
- * check_submounts_and_drop - prune dcache, check for submounts and drop
+ * d_invalidate - detach submounts, prune dcache, and drop
+ * @dentry: dentry to invalidate (aka detach, prune and drop)
*
- * All done as a single atomic operation relative to has_unlinked_ancestor().
- * Returns 0 if successfully unhashed @parent. If there were submounts then
- * return -EBUSY.
+ * no dcache lock.
*
- * @dentry: dentry to prune and drop
+ * The final d_drop is done as an atomic operation relative to
+ * rename_lock ensuring there are no races with d_set_mounted. This
+ * ensures there are no unhashed dentries on the path to a mountpoint.
*/
-int check_submounts_and_drop(struct dentry *dentry)
+void d_invalidate(struct dentry *dentry)
{
- int ret = 0;
+ /*
+ * If it's already been dropped, return OK.
+ */
+ spin_lock(&dentry->d_lock);
+ if (d_unhashed(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ return;
+ }
+ spin_unlock(&dentry->d_lock);
/* Negative dentries can be dropped without further checks */
if (!dentry->d_inode) {
d_drop(dentry);
- goto out;
+ return;
}
for (;;) {
- struct select_data data;
+ struct detach_data data;
- INIT_LIST_HEAD(&data.dispose);
- data.start = dentry;
- data.found = 0;
+ data.mountpoint = NULL;
+ INIT_LIST_HEAD(&data.select.dispose);
+ data.select.start = dentry;
+ data.select.found = 0;
+
+ d_walk(dentry, &data, detach_and_collect, check_and_drop);
- d_walk(dentry, &data, check_and_collect, check_and_drop);
- ret = data.found;
+ if (data.select.found)
+ shrink_dentry_list(&data.select.dispose);
- if (!list_empty(&data.dispose))
- shrink_dentry_list(&data.dispose);
+ if (data.mountpoint) {
+ detach_mounts(data.mountpoint);
+ dput(data.mountpoint);
+ }
- if (ret <= 0)
+ if (!data.mountpoint && !data.select.found)
break;
cond_resched();
}
-
-out:
- return ret;
}
-EXPORT_SYMBOL(check_submounts_and_drop);
+EXPORT_SYMBOL(d_invalidate);
/**
* __d_alloc - allocate a dcache entry
@@ -1438,11 +1421,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
*/
dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
if (name->len > DNAME_INLINE_LEN-1) {
- dname = kmalloc(name->len + 1, GFP_KERNEL);
- if (!dname) {
+ size_t size = offsetof(struct external_name, name[1]);
+ struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
+ if (!p) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
+ atomic_set(&p->u.count, 1);
+ dname = p->name;
} else {
dname = dentry->d_iname;
}
@@ -2112,10 +2098,10 @@ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
struct dentry *dentry;
unsigned seq;
- do {
- seq = read_seqbegin(&rename_lock);
- dentry = __d_lookup(parent, name);
- if (dentry)
+ do {
+ seq = read_seqbegin(&rename_lock);
+ dentry = __d_lookup(parent, name);
+ if (dentry)
break;
} while (read_seqretry(&rename_lock, seq));
return dentry;
@@ -2372,11 +2358,10 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
}
EXPORT_SYMBOL(dentry_update_name_case);
-static void switch_names(struct dentry *dentry, struct dentry *target,
- bool exchange)
+static void swap_names(struct dentry *dentry, struct dentry *target)
{
- if (dname_external(target)) {
- if (dname_external(dentry)) {
+ if (unlikely(dname_external(target))) {
+ if (unlikely(dname_external(dentry))) {
/*
* Both external: swap the pointers
*/
@@ -2392,7 +2377,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target,
target->d_name.name = target->d_iname;
}
} else {
- if (dname_external(dentry)) {
+ if (unlikely(dname_external(dentry))) {
/*
* dentry:external, target:internal. Give dentry's
* storage to target and make dentry internal
@@ -2407,12 +2392,6 @@ static void switch_names(struct dentry *dentry, struct dentry *target,
*/
unsigned int i;
BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
- if (!exchange) {
- memcpy(dentry->d_iname, target->d_name.name,
- target->d_name.len + 1);
- dentry->d_name.hash_len = target->d_name.hash_len;
- return;
- }
for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
swap(((long *) &dentry->d_iname)[i],
((long *) &target->d_iname)[i]);
@@ -2422,6 +2401,24 @@ static void switch_names(struct dentry *dentry, struct dentry *target,
swap(dentry->d_name.hash_len, target->d_name.hash_len);
}
+static void copy_name(struct dentry *dentry, struct dentry *target)
+{
+ struct external_name *old_name = NULL;
+ if (unlikely(dname_external(dentry)))
+ old_name = external_name(dentry);
+ if (unlikely(dname_external(target))) {
+ atomic_inc(&external_name(target)->u.count);
+ dentry->d_name = target->d_name;
+ } else {
+ memcpy(dentry->d_iname, target->d_name.name,
+ target->d_name.len + 1);
+ dentry->d_name.name = dentry->d_iname;
+ dentry->d_name.hash_len = target->d_name.hash_len;
+ }
+ if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
+ kfree_rcu(old_name, u.head);
+}
+
static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
{
/*
@@ -2518,7 +2515,10 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
}
/* Switch the names.. */
- switch_names(dentry, target, exchange);
+ if (exchange)
+ swap_names(dentry, target);
+ else
+ copy_name(dentry, target);
/* ... and switch them in the tree */
if (IS_ROOT(dentry)) {
@@ -2625,10 +2625,8 @@ static struct dentry *__d_unalias(struct inode *inode,
goto out_err;
m2 = &alias->d_parent->d_inode->i_mutex;
out_unalias:
- if (likely(!d_mountpoint(alias))) {
- __d_move(alias, dentry, false);
- ret = alias;
- }
+ __d_move(alias, dentry, false);
+ ret = alias;
out_err:
spin_unlock(&inode->i_lock);
if (m2)
@@ -2810,6 +2808,9 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
* the beginning of the name. The sequence number check at the caller will
* retry it again when a d_move() does happen. So any garbage in the buffer
* due to mismatched pointer and length will be discarded.
+ *
+ * Data dependency barrier is needed to make sure that we see that terminating
+ * NUL. Alpha strikes again, film at 11...
*/
static int prepend_name(char **buffer, int *buflen, struct qstr *name)
{
@@ -2817,6 +2818,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
u32 dlen = ACCESS_ONCE(name->len);
char *p;
+ smp_read_barrier_depends();
+
*buflen -= dlen + 1;
if (*buflen < 0)
return -ENAMETOOLONG;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index f704458ea5f5..e0ab3a93eeff 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -30,7 +30,7 @@ struct plock_op {
struct plock_xop {
struct plock_op xop;
- void *callback;
+ int (*callback)(struct file_lock *fl, int result);
void *fl;
void *file;
struct file_lock flc;
@@ -190,7 +190,7 @@ static int dlm_plock_callback(struct plock_op *op)
struct file *file;
struct file_lock *fl;
struct file_lock *flc;
- int (*notify)(void *, void *, int) = NULL;
+ int (*notify)(struct file_lock *fl, int result) = NULL;
struct plock_xop *xop = (struct plock_xop *)op;
int rv = 0;
@@ -209,7 +209,7 @@ static int dlm_plock_callback(struct plock_op *op)
notify = xop->callback;
if (op->info.rv) {
- notify(fl, NULL, op->info.rv);
+ notify(fl, op->info.rv);
goto out;
}
@@ -228,7 +228,7 @@ static int dlm_plock_callback(struct plock_op *op)
(unsigned long long)op->info.number, file, fl);
}
- rv = notify(fl, NULL, 0);
+ rv = notify(fl, 0);
if (rv) {
/* XXX: We need to cancel the fs lock here: */
log_print("dlm_plock_callback: lock granted after lock request "
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index db0fad3269c0..f5bce9096555 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -229,8 +229,8 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
- "[%s]; rc = [%d]\n", __func__,
- ecryptfs_dentry->d_name.name, rc);
+ "[%pd]; rc = [%d]\n", __func__,
+ ecryptfs_dentry, rc);
goto out_free;
}
if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
@@ -327,7 +327,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct file *lower_file = ecryptfs_file_to_lower(file);
long rc = -ENOIOCTLCMD;
- if (lower_file->f_op && lower_file->f_op->compat_ioctl)
+ if (lower_file->f_op->compat_ioctl)
rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
return rc;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d4a9431ec73c..1686dc2da9fd 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -53,9 +53,7 @@ static void unlock_dir(struct dentry *dir)
static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
{
- if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode)
- return 1;
- return 0;
+ return ecryptfs_inode_to_lower(inode) == lower_inode;
}
static int ecryptfs_inode_set(struct inode *inode, void *opaque)
@@ -192,12 +190,6 @@ ecryptfs_do_create(struct inode *directory_inode,
lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
lower_dir_dentry = lock_parent(lower_dentry);
- if (IS_ERR(lower_dir_dentry)) {
- ecryptfs_printk(KERN_ERR, "Error locking directory of "
- "dentry\n");
- inode = ERR_CAST(lower_dir_dentry);
- goto out;
- }
rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true);
if (rc) {
printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
@@ -215,7 +207,6 @@ ecryptfs_do_create(struct inode *directory_inode,
fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
out_lock:
unlock_dir(lower_dir_dentry);
-out:
return inode;
}
@@ -250,8 +241,8 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
- "[%s]; rc = [%d]\n", __func__,
- ecryptfs_dentry->d_name.name, rc);
+ "[%pd]; rc = [%d]\n", __func__,
+ ecryptfs_dentry, rc);
goto out;
}
rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode);
@@ -313,8 +304,8 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
- "[%s]; rc = [%d]\n", __func__,
- dentry->d_name.name, rc);
+ "[%pd]; rc = [%d]\n", __func__,
+ dentry, rc);
return rc;
}
@@ -418,8 +409,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
- "[%d] on lower_dentry = [%s]\n", __func__, rc,
- ecryptfs_dentry->d_name.name);
+ "[%d] on lower_dentry = [%pd]\n", __func__, rc,
+ ecryptfs_dentry);
goto out;
}
if (lower_dentry->d_inode)
@@ -1039,7 +1030,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
}
rc = vfs_setxattr(lower_dentry, name, value, size, flags);
- if (!rc)
+ if (!rc && dentry->d_inode)
fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);
out:
return rc;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 4725a07f003c..635e8e16a5b7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -26,7 +26,6 @@
*/
#include <linux/string.h>
-#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/key.h>
#include <linux/random.h>
@@ -1846,7 +1845,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
"(Tag 11 not allowed by itself)\n");
rc = -EIO;
goto out_wipe_list;
- break;
default:
ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
"of the file header; hex value of "
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index e57380e5f6bd..286f10b0363b 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -434,8 +434,7 @@ void ecryptfs_release_messaging(void)
mutex_lock(&ecryptfs_msg_ctx_lists_mux);
for (i = 0; i < ecryptfs_message_buf_len; i++) {
mutex_lock(&ecryptfs_msg_ctx_arr[i].mux);
- if (ecryptfs_msg_ctx_arr[i].msg)
- kfree(ecryptfs_msg_ctx_arr[i].msg);
+ kfree(ecryptfs_msg_ctx_arr[i].msg);
mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux);
}
kfree(ecryptfs_msg_ctx_arr);
diff --git a/fs/exec.c b/fs/exec.c
index a2b42a98c743..7302b75a9820 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1372,18 +1372,23 @@ int search_binary_handler(struct linux_binprm *bprm)
read_unlock(&binfmt_lock);
bprm->recursion_depth++;
retval = fmt->load_binary(bprm);
+ read_lock(&binfmt_lock);
+ put_binfmt(fmt);
bprm->recursion_depth--;
- if (retval >= 0 || retval != -ENOEXEC ||
- bprm->mm == NULL || bprm->file == NULL) {
- put_binfmt(fmt);
+ if (retval < 0 && !bprm->mm) {
+ /* we got to flush_old_exec() and failed after it */
+ read_unlock(&binfmt_lock);
+ force_sigsegv(SIGSEGV, current);
+ return retval;
+ }
+ if (retval != -ENOEXEC || !bprm->file) {
+ read_unlock(&binfmt_lock);
return retval;
}
- read_lock(&binfmt_lock);
- put_binfmt(fmt);
}
read_unlock(&binfmt_lock);
- if (need_retry && retval == -ENOEXEC) {
+ if (need_retry) {
if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
printable(bprm->buf[2]) && printable(bprm->buf[3]))
return retval;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b88edc05c230..170dc41e8bf4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
ext2_rsv_window_add(sb, &sbi->s_rsv_window_head);
err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext2_count_free_blocks(sb));
+ ext2_count_free_blocks(sb), GFP_KERNEL);
if (!err) {
err = percpu_counter_init(&sbi->s_freeinodes_counter,
- ext2_count_free_inodes(sb));
+ ext2_count_free_inodes(sb), GFP_KERNEL);
}
if (!err) {
err = percpu_counter_init(&sbi->s_dirs_counter,
- ext2_count_dirs(sb));
+ ext2_count_dirs(sb), GFP_KERNEL);
}
if (err) {
ext2_msg(sb, KERN_ERR, "error: insufficient memory");
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
index e85ff15a060e..fc3cdcf24aed 100644
--- a/fs/ext3/ext3.h
+++ b/fs/ext3/ext3.h
@@ -237,6 +237,8 @@ struct ext3_new_group_data {
#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+/* Number of supported quota types */
+#define EXT3_MAXQUOTAS 2
/*
* Mount options
@@ -248,7 +250,7 @@ struct ext3_mount_options {
unsigned long s_commit_interval;
#ifdef CONFIG_QUOTA
int s_jquota_fmt;
- char *s_qf_names[MAXQUOTAS];
+ char *s_qf_names[EXT3_MAXQUOTAS];
#endif
};
@@ -669,7 +671,7 @@ struct ext3_sb_info {
unsigned long s_commit_interval;
struct block_device *journal_bdev;
#ifdef CONFIG_QUOTA
- char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
#endif
};
@@ -1183,9 +1185,9 @@ extern const struct inode_operations ext3_fast_symlink_inode_operations;
#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
#endif
-#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
-#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
-#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
int
ext3_mark_iloc_dirty(handle_t *handle,
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 622e88249024..7015db0bafd1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -441,7 +441,7 @@ static void ext3_put_super (struct super_block * sb)
percpu_counter_destroy(&sbi->s_dirs_counter);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT3_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
#endif
@@ -1555,7 +1555,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
/* Needed for iput() to work correctly and not trash data */
sb->s_flags |= MS_ACTIVE;
/* Turn on quotas so that they are updated correctly */
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < EXT3_MAXQUOTAS; i++) {
if (EXT3_SB(sb)->s_qf_names[i]) {
int ret = ext3_quota_on_mount(sb, i);
if (ret < 0)
@@ -1606,7 +1606,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
/* Turn quotas off */
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < EXT3_MAXQUOTAS; i++) {
if (sb_dqopt(sb)->files[i])
dquot_quota_off(sb, i);
}
@@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount2;
}
err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext3_count_free_blocks(sb));
+ ext3_count_free_blocks(sb), GFP_KERNEL);
if (!err) {
err = percpu_counter_init(&sbi->s_freeinodes_counter,
- ext3_count_free_inodes(sb));
+ ext3_count_free_inodes(sb), GFP_KERNEL);
}
if (!err) {
err = percpu_counter_init(&sbi->s_dirs_counter,
- ext3_count_dirs(sb));
+ ext3_count_dirs(sb), GFP_KERNEL);
}
if (err) {
ext3_msg(sb, KERN_ERR, "error: insufficient memory");
@@ -2139,7 +2139,7 @@ failed_mount2:
kfree(sbi->s_group_desc);
failed_mount:
#ifdef CONFIG_QUOTA
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT3_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
#endif
ext3_blkdev_remove(sbi);
@@ -2659,7 +2659,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
old_opts.s_commit_interval = sbi->s_commit_interval;
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT3_MAXQUOTAS; i++)
if (sbi->s_qf_names[i]) {
old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
GFP_KERNEL);
@@ -2763,7 +2763,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
}
#ifdef CONFIG_QUOTA
/* Release old quota file names */
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT3_MAXQUOTAS; i++)
kfree(old_opts.s_qf_names[i]);
#endif
if (enable_quota)
@@ -2777,7 +2777,7 @@ restore_opts:
sbi->s_commit_interval = old_opts.s_commit_interval;
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < EXT3_MAXQUOTAS; i++) {
kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0b28b36e7915..05c159218bc2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3892,7 +3892,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Register extent status tree shrinker */
ext4_es_register_shrinker(sbi);
- if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) {
+ err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
+ if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
goto failed_mount3;
}
@@ -4106,17 +4107,20 @@ no_journal:
block = ext4_count_free_clusters(sb);
ext4_free_blocks_count_set(sbi->s_es,
EXT4_C2B(sbi, block));
- err = percpu_counter_init(&sbi->s_freeclusters_counter, block);
+ err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
+ GFP_KERNEL);
if (!err) {
unsigned long freei = ext4_count_free_inodes(sb);
sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
- err = percpu_counter_init(&sbi->s_freeinodes_counter, freei);
+ err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
+ GFP_KERNEL);
}
if (!err)
err = percpu_counter_init(&sbi->s_dirs_counter,
- ext4_count_dirs(sb));
+ ext4_count_dirs(sb), GFP_KERNEL);
if (!err)
- err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
+ err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
+ GFP_KERNEL);
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
goto failed_mount6;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index ec3b7a5381fa..dd10a031c052 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -72,7 +72,22 @@ out:
return page;
}
-static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
+struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ bool readahead = false;
+ struct page *page;
+
+ page = find_get_page(META_MAPPING(sbi), index);
+ if (!page || (page && !PageUptodate(page)))
+ readahead = true;
+ f2fs_put_page(page, 0);
+
+ if (readahead)
+ ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
+ return get_meta_page(sbi, index);
+}
+
+static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
{
switch (type) {
case META_NAT:
@@ -82,6 +97,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
case META_SSA:
case META_CP:
return 0;
+ case META_POR:
+ return MAX_BLKADDR(sbi);
default:
BUG();
}
@@ -90,12 +107,12 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
/*
* Readahead CP/NAT/SIT/SSA pages
*/
-int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
{
block_t prev_blk_addr = 0;
struct page *page;
- int blkno = start;
- int max_blks = get_max_meta_blks(sbi, type);
+ block_t blkno = start;
+ block_t max_blks = get_max_meta_blks(sbi, type);
struct f2fs_io_info fio = {
.type = META,
@@ -125,7 +142,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
break;
case META_SSA:
case META_CP:
- /* get ssa/cp block addr */
+ case META_POR:
+ if (unlikely(blkno >= max_blks))
+ goto out;
+ if (unlikely(blkno < SEG0_BLKADDR(sbi)))
+ goto out;
blk_addr = blkno;
break;
default:
@@ -151,8 +172,7 @@ out:
static int f2fs_write_meta_page(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
trace_f2fs_writepage(page, META);
@@ -177,7 +197,7 @@ redirty_out:
static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
long diff, written;
trace_f2fs_writepages(mapping->host, wbc, META);
@@ -259,15 +279,12 @@ continue_unlock:
static int f2fs_set_meta_page_dirty(struct page *page)
{
- struct address_space *mapping = page->mapping;
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-
trace_f2fs_set_page_dirty(page, META);
SetPageUptodate(page);
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
- inc_page_count(sbi, F2FS_DIRTY_META);
+ inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
return 1;
}
return 0;
@@ -378,7 +395,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
void release_orphan_inode(struct f2fs_sb_info *sbi)
{
spin_lock(&sbi->ino_lock[ORPHAN_INO]);
- f2fs_bug_on(sbi->n_orphans == 0);
+ f2fs_bug_on(sbi, sbi->n_orphans == 0);
sbi->n_orphans--;
spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
}
@@ -398,7 +415,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
struct inode *inode = f2fs_iget(sbi->sb, ino);
- f2fs_bug_on(IS_ERR(inode));
+ f2fs_bug_on(sbi, IS_ERR(inode));
clear_nlink(inode);
/* truncate all the data during iput */
@@ -459,7 +476,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
list_for_each_entry(orphan, head, list) {
if (!page) {
page = find_get_page(META_MAPPING(sbi), start_blk++);
- f2fs_bug_on(!page);
+ f2fs_bug_on(sbi, !page);
orphan_blk =
(struct f2fs_orphan_block *)page_address(page);
memset(orphan_blk, 0, sizeof(*orphan_blk));
@@ -619,7 +636,7 @@ fail_no_cp:
static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
return -EEXIST;
@@ -631,32 +648,38 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
return 0;
}
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+void update_dirty_page(struct inode *inode, struct page *page)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dir_inode_entry *new;
int ret = 0;
- if (!S_ISDIR(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
return;
+ if (!S_ISDIR(inode->i_mode)) {
+ inode_inc_dirty_pages(inode);
+ goto out;
+ }
+
new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
new->inode = inode;
INIT_LIST_HEAD(&new->list);
spin_lock(&sbi->dir_inode_lock);
ret = __add_dirty_inode(inode, new);
- inode_inc_dirty_dents(inode);
- SetPagePrivate(page);
+ inode_inc_dirty_pages(inode);
spin_unlock(&sbi->dir_inode_lock);
if (ret)
kmem_cache_free(inode_entry_slab, new);
+out:
+ SetPagePrivate(page);
}
void add_dirty_dir_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dir_inode_entry *new =
f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
int ret = 0;
@@ -674,14 +697,14 @@ void add_dirty_dir_inode(struct inode *inode)
void remove_dirty_dir_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dir_inode_entry *entry;
if (!S_ISDIR(inode->i_mode))
return;
spin_lock(&sbi->dir_inode_lock);
- if (get_dirty_dents(inode) ||
+ if (get_dirty_pages(inode) ||
!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
spin_unlock(&sbi->dir_inode_lock);
return;
@@ -802,11 +825,12 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
finish_wait(&sbi->cp_wait, &wait);
}
-static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
- nid_t last_nid = 0;
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ nid_t last_nid = nm_i->next_scan_nid;
block_t start_blk;
struct page *cp_page;
unsigned int data_sum_blocks, orphan_blocks;
@@ -869,7 +893,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
orphan_blocks);
- if (is_umount) {
+ if (cpc->reason == CP_UMOUNT) {
set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
cp_payload_blks + data_sum_blocks +
@@ -886,6 +910,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
else
clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+ if (sbi->need_fsck)
+ set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+
/* update SIT/NAT bitmap */
get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
@@ -920,7 +947,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
write_data_summaries(sbi, start_blk);
start_blk += data_sum_blocks;
- if (is_umount) {
+ if (cpc->reason == CP_UMOUNT) {
write_node_summaries(sbi, start_blk);
start_blk += NR_CURSEG_NODE_TYPE;
}
@@ -960,23 +987,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
/*
* We guarantee that this checkpoint procedure will not fail.
*/
-void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
- trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops");
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
mutex_lock(&sbi->cp_mutex);
- if (!sbi->s_dirty)
+ if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
goto out;
if (unlikely(f2fs_cp_error(sbi)))
goto out;
if (block_operations(sbi))
goto out;
- trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
f2fs_submit_merged_bio(sbi, DATA, WRITE);
f2fs_submit_merged_bio(sbi, NODE, WRITE);
@@ -992,16 +1019,16 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
/* write cached NAT/SIT entries to NAT/SIT area */
flush_nat_entries(sbi);
- flush_sit_entries(sbi);
+ flush_sit_entries(sbi, cpc);
/* unlock all the fs_lock[] in do_checkpoint() */
- do_checkpoint(sbi, is_umount);
+ do_checkpoint(sbi, cpc);
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
out:
mutex_unlock(&sbi->cp_mutex);
- trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
}
void init_ino_entry_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 76de83e25a89..8e58c4cc2cb9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -85,7 +85,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
bio = bio_alloc(GFP_NOIO, npages);
bio->bi_bdev = sbi->sb->s_bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
bio->bi_private = sbi;
@@ -193,7 +193,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
- int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ int bio_blocks = MAX_BIO_BLOCKS(sbi);
io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
io->fio = *fio;
@@ -236,7 +236,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
int reserve_new_block(struct dnode_of_data *dn)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return -EPERM;
@@ -258,7 +258,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
int err;
/* if inode_page exists, index should be zero */
- f2fs_bug_on(!need_put && index);
+ f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index);
err = get_dnode_of_data(dn, index, ALLOC_NODE);
if (err)
@@ -321,7 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
block_t start_blkaddr, end_blkaddr;
int need_update = true;
- f2fs_bug_on(blk_addr == NEW_ADDR);
+ f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
dn->ofs_in_node;
@@ -396,7 +396,6 @@ end_update:
struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
@@ -429,7 +428,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
return page;
}
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
sync ? READ_SYNC : READA);
if (err)
return ERR_PTR(err);
@@ -451,7 +450,6 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
*/
struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
@@ -490,7 +488,8 @@ repeat:
return page;
}
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
+ dn.data_blkaddr, READ_SYNC);
if (err)
return ERR_PTR(err);
@@ -517,7 +516,6 @@ repeat:
struct page *get_new_data_page(struct inode *inode,
struct page *ipage, pgoff_t index, bool new_i_size)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
struct page *page;
struct dnode_of_data dn;
@@ -541,8 +539,8 @@ repeat:
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
- READ_SYNC);
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
+ dn.data_blkaddr, READ_SYNC);
if (err)
goto put_err;
@@ -573,10 +571,12 @@ put_err:
static int __allocate_data_block(struct dnode_of_data *dn)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ struct f2fs_inode_info *fi = F2FS_I(dn->inode);
struct f2fs_summary sum;
block_t new_blkaddr;
struct node_info ni;
+ pgoff_t fofs;
int type;
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
@@ -599,6 +599,12 @@ static int __allocate_data_block(struct dnode_of_data *dn)
update_extent_cache(new_blkaddr, dn);
clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+ /* update i_size */
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+ dn->ofs_in_node;
+ if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
+ i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
+
dn->data_blkaddr = new_blkaddr;
return 0;
}
@@ -614,7 +620,6 @@ static int __allocate_data_block(struct dnode_of_data *dn)
static int __get_data_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create, bool fiemap)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
unsigned int blkbits = inode->i_sb->s_blocksize_bits;
unsigned maxblocks = bh_result->b_size >> blkbits;
struct dnode_of_data dn;
@@ -630,8 +635,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
goto out;
if (create) {
- f2fs_balance_fs(sbi);
- f2fs_lock_op(sbi);
+ f2fs_balance_fs(F2FS_I_SB(inode));
+ f2fs_lock_op(F2FS_I_SB(inode));
}
/* When reading holes, we need its node page */
@@ -707,7 +712,7 @@ put_out:
f2fs_put_dnode(&dn);
unlock_out:
if (create)
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(F2FS_I_SB(inode));
out:
trace_f2fs_get_data_block(inode, iblock, bh_result, err);
return err;
@@ -804,7 +809,7 @@ static int f2fs_write_data_page(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
>> PAGE_CACHE_SHIFT;
@@ -846,7 +851,7 @@ write:
if (unlikely(f2fs_cp_error(sbi))) {
SetPageError(page);
unlock_page(page);
- return 0;
+ goto out;
}
if (!wbc->for_reclaim)
@@ -866,7 +871,7 @@ done:
clear_cold_data(page);
out:
- inode_dec_dirty_dents(inode);
+ inode_dec_dirty_pages(inode);
unlock_page(page);
if (need_balance_fs)
f2fs_balance_fs(sbi);
@@ -892,7 +897,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
bool locked = false;
int ret;
long diff;
@@ -904,7 +909,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
return 0;
if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
- get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) &&
+ get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
@@ -926,7 +931,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
return ret;
skip_write:
- wbc->pages_skipped += get_dirty_dents(inode);
+ wbc->pages_skipped += get_dirty_pages(inode);
return 0;
}
@@ -945,7 +950,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page;
pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
struct dnode_of_data dn;
@@ -1047,7 +1052,10 @@ static int f2fs_write_end(struct file *file,
trace_f2fs_write_end(inode, pos, len, copied);
- set_page_dirty(page);
+ if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ register_inmem_page(inode, page);
+ else
+ set_page_dirty(page);
if (pos + copied > i_size_read(inode)) {
i_size_write(inode, pos + copied);
@@ -1092,9 +1100,6 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
if (check_direct_IO(inode, rw, iter, offset))
return 0;
- /* clear fsync mark to recover these blocks */
- fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
-
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
@@ -1110,8 +1115,12 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
unsigned int length)
{
struct inode *inode = page->mapping->host;
+
+ if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
+ return;
+
if (PageDirty(page))
- inode_dec_dirty_dents(inode);
+ inode_dec_dirty_pages(inode);
ClearPagePrivate(page);
}
@@ -1133,7 +1142,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
- set_dirty_dir_page(inode, page);
+ update_dirty_page(inode, page);
return 1;
}
return 0;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fecebdbfd781..0a91ab813a9e 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -93,7 +93,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
total_vblocks = 0;
blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
hblks_per_sec = blks_per_sec / 2;
- for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
dist = abs(vblocks - hblks_per_sec);
bimodal += dist * dist;
@@ -103,7 +103,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
ndirty++;
}
}
- dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
+ dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
si->bimodal = bimodal / dist;
if (si->dirty_count)
si->avg_vblocks = total_vblocks / ndirty;
@@ -131,17 +131,17 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build sit */
si->base_mem += sizeof(struct sit_info);
- si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry);
- si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
- si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
+ si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
+ si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+ si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
if (sbi->segs_per_sec > 1)
- si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry);
+ si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
/* build free segmap */
si->base_mem += sizeof(struct free_segmap_info);
- si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
- si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
+ si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+ si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
/* build curseg */
si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
@@ -149,8 +149,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build dirty segmap */
si->base_mem += sizeof(struct dirty_seglist_info);
- si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
- si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
+ si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi));
+ si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
/* build nm */
si->base_mem += sizeof(struct f2fs_nm_info);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 155fb056b7f1..b54f87149c09 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -126,7 +126,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
* For the most part, it should be a bug when name_len is zero.
* We stop here for figuring out where the bugs has occurred.
*/
- f2fs_bug_on(!de->name_len);
+ f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len);
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
}
@@ -151,7 +151,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
bool room = false;
int max_slots = 0;
- f2fs_bug_on(level > MAX_DIR_HASH_DEPTH);
+ f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
@@ -284,10 +284,9 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
int update_dent_inode(struct inode *inode, const struct qstr *name)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *page;
- page = get_node_page(sbi, inode->i_ino);
+ page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(page))
return PTR_ERR(page);
@@ -337,7 +336,6 @@ static int make_empty_dir(struct inode *inode,
static struct page *init_inode_metadata(struct inode *inode,
struct inode *dir, const struct qstr *name)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
struct page *page;
int err;
@@ -360,7 +358,7 @@ static struct page *init_inode_metadata(struct inode *inode,
if (err)
goto put_error;
} else {
- page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+ page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
if (IS_ERR(page))
return page;
@@ -381,7 +379,7 @@ static struct page *init_inode_metadata(struct inode *inode,
* we should remove this inode from orphan list.
*/
if (inode->i_nlink == 0)
- remove_orphan_inode(sbi, inode->i_ino);
+ remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
inc_nlink(inode);
}
return page;
@@ -571,8 +569,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
{
struct f2fs_dentry_block *dentry_blk;
unsigned int bit_pos;
- struct address_space *mapping = page->mapping;
- struct inode *dir = mapping->host;
+ struct inode *dir = page->mapping->host;
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
int i;
@@ -594,7 +591,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
if (inode) {
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
down_write(&F2FS_I(inode)->i_sem);
@@ -621,7 +618,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
truncate_hole(dir, page->index, page->index + 1);
clear_page_dirty_for_io(page);
ClearPageUptodate(page);
- inode_dec_dirty_dents(dir);
+ inode_dec_dirty_pages(dir);
}
f2fs_put_page(page, 1);
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e921242186f6..8171e80b2ee9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -21,10 +21,16 @@
#include <linux/sched.h>
#ifdef CONFIG_F2FS_CHECK_FS
-#define f2fs_bug_on(condition) BUG_ON(condition)
+#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
#else
-#define f2fs_bug_on(condition) WARN_ON(condition)
+#define f2fs_bug_on(sbi, condition) \
+ do { \
+ if (unlikely(condition)) { \
+ WARN_ON(1); \
+ sbi->need_fsck = true; \
+ } \
+ } while (0)
#define f2fs_down_write(x, y) down_write(x)
#endif
@@ -90,6 +96,20 @@ enum {
SIT_BITMAP
};
+enum {
+ CP_UMOUNT,
+ CP_SYNC,
+ CP_DISCARD,
+};
+
+struct cp_control {
+ int reason;
+ __u64 trim_start;
+ __u64 trim_end;
+ __u64 trim_minlen;
+ __u64 trimmed;
+};
+
/*
* For CP/NAT/SIT/SSA readahead
*/
@@ -97,7 +117,8 @@ enum {
META_CP,
META_NAT,
META_SIT,
- META_SSA
+ META_SSA,
+ META_POR,
};
/* for the list of ino */
@@ -130,7 +151,9 @@ struct discard_entry {
struct fsync_inode_entry {
struct list_head list; /* list head */
struct inode *inode; /* vfs inode pointer */
- block_t blkaddr; /* block address locating the last inode */
+ block_t blkaddr; /* block address locating the last fsync */
+ block_t last_dentry; /* block address locating the last dentry */
+ block_t last_inode; /* block address locating the last inode */
};
#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats))
@@ -141,6 +164,9 @@ struct fsync_inode_entry {
#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
+#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
+#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
+
static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
{
int before = nats_in_cursum(rs);
@@ -155,11 +181,24 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
return before;
}
+static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
+ int type)
+{
+ if (type == NAT_JOURNAL)
+ return size <= MAX_NAT_JENTRIES(sum);
+ return size <= MAX_SIT_JENTRIES(sum);
+}
+
/*
* ioctl commands
*/
-#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
+
+#define F2FS_IOCTL_MAGIC 0xf5
+#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
+#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
+#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -222,13 +261,16 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
struct rw_semaphore i_sem; /* protect fi info */
- atomic_t dirty_dents; /* # of dirty dentry pages */
+ atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
unsigned long long xattr_ver; /* cp version of xattr modification */
struct extent_info ext; /* in-memory extent cache entry */
struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */
+
+ struct list_head inmem_pages; /* inmemory pages managed by f2fs */
+ struct mutex inmem_lock; /* lock for inmemory pages */
};
static inline void get_extent_info(struct extent_info *ext,
@@ -260,11 +302,10 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
+ struct radix_tree_root nat_set_root;/* root of the nat set cache */
rwlock_t nat_tree_lock; /* protect nat_tree_lock */
- unsigned int nat_cnt; /* the # of cached nat entries */
struct list_head nat_entries; /* cached nat entry list (clean) */
- struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
- struct list_head nat_entry_set; /* nat entry set list */
+ unsigned int nat_cnt; /* the # of cached nat entries */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */
/* free node ids management */
@@ -332,18 +373,16 @@ enum {
};
struct flush_cmd {
- struct flush_cmd *next;
struct completion wait;
+ struct llist_node llnode;
int ret;
};
struct flush_cmd_control {
struct task_struct *f2fs_issue_flush; /* flush thread */
wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
- struct flush_cmd *issue_list; /* list for command issue */
- struct flush_cmd *dispatch_list; /* list for command dispatch */
- spinlock_t issue_lock; /* for issue list lock */
- struct flush_cmd *issue_tail; /* list tail of issue list */
+ struct llist_head issue_list; /* list for command issue */
+ struct llist_node *dispatch_list; /* list for command dispatch */
};
struct f2fs_sm_info {
@@ -369,8 +408,11 @@ struct f2fs_sm_info {
int nr_discards; /* # of discards in the list */
int max_discards; /* max. discards to be issued */
+ struct list_head sit_entry_set; /* sit entry set list */
+
unsigned int ipu_policy; /* in-place-update policy */
unsigned int min_ipu_util; /* in-place-update threshold */
+ unsigned int min_fsync_blocks; /* threshold for fsync */
/* for flush command control */
struct flush_cmd_control *cmd_control_info;
@@ -434,6 +476,7 @@ struct f2fs_sb_info {
struct buffer_head *raw_super_buf; /* buffer head of raw sb */
struct f2fs_super_block *raw_super; /* raw super block pointer */
int s_dirty; /* dirty flag for checkpoint */
+ bool need_fsck; /* need fsck.f2fs to fix */
/* for node-related operations */
struct f2fs_nm_info *nm_info; /* node manager */
@@ -539,6 +582,21 @@ static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
return sb->s_fs_info;
}
+static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
+{
+ return F2FS_SB(inode->i_sb);
+}
+
+static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
+{
+ return F2FS_I_SB(mapping->host);
+}
+
+static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
+{
+ return F2FS_M_SB(page->mapping);
+}
+
static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
{
return (struct f2fs_super_block *)(sbi->raw_super);
@@ -703,8 +761,8 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
blkcnt_t count)
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi->total_valid_block_count < (block_t) count);
- f2fs_bug_on(inode->i_blocks < count);
+ f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
+ f2fs_bug_on(sbi, inode->i_blocks < count);
inode->i_blocks -= count;
sbi->total_valid_block_count -= (block_t)count;
spin_unlock(&sbi->stat_lock);
@@ -716,10 +774,11 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
F2FS_SET_SB_DIRT(sbi);
}
-static inline void inode_inc_dirty_dents(struct inode *inode)
+static inline void inode_inc_dirty_pages(struct inode *inode)
{
- inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
- atomic_inc(&F2FS_I(inode)->dirty_dents);
+ atomic_inc(&F2FS_I(inode)->dirty_pages);
+ if (S_ISDIR(inode->i_mode))
+ inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -727,13 +786,15 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
atomic_dec(&sbi->nr_pages[count_type]);
}
-static inline void inode_dec_dirty_dents(struct inode *inode)
+static inline void inode_dec_dirty_pages(struct inode *inode)
{
- if (!S_ISDIR(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
return;
- dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
- atomic_dec(&F2FS_I(inode)->dirty_dents);
+ atomic_dec(&F2FS_I(inode)->dirty_pages);
+
+ if (S_ISDIR(inode->i_mode))
+ dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
}
static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -741,9 +802,9 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
return atomic_read(&sbi->nr_pages[count_type]);
}
-static inline int get_dirty_dents(struct inode *inode)
+static inline int get_dirty_pages(struct inode *inode)
{
- return atomic_read(&F2FS_I(inode)->dirty_dents);
+ return atomic_read(&F2FS_I(inode)->dirty_pages);
}
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
@@ -848,9 +909,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(!sbi->total_valid_block_count);
- f2fs_bug_on(!sbi->total_valid_node_count);
- f2fs_bug_on(!inode->i_blocks);
+ f2fs_bug_on(sbi, !sbi->total_valid_block_count);
+ f2fs_bug_on(sbi, !sbi->total_valid_node_count);
+ f2fs_bug_on(sbi, !inode->i_blocks);
inode->i_blocks--;
sbi->total_valid_node_count--;
@@ -867,7 +928,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count);
+ f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
sbi->total_valid_inode_count++;
spin_unlock(&sbi->stat_lock);
}
@@ -875,7 +936,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(!sbi->total_valid_inode_count);
+ f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
sbi->total_valid_inode_count--;
spin_unlock(&sbi->stat_lock);
}
@@ -891,7 +952,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
return;
if (unlock) {
- f2fs_bug_on(!PageLocked(page));
+ f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
unlock_page(page);
}
page_cache_release(page);
@@ -998,7 +1059,9 @@ enum {
FI_INLINE_DATA, /* used for inline data*/
FI_APPEND_WRITE, /* inode has appended data */
FI_UPDATE_WRITE, /* inode has in-place-update data */
- FI_NEED_IPU, /* used fo ipu for fdatasync */
+ FI_NEED_IPU, /* used for ipu per file */
+ FI_ATOMIC_FILE, /* indicate atomic file */
+ FI_VOLATILE_FILE, /* indicate volatile file */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1085,6 +1148,16 @@ static inline int f2fs_has_inline_data(struct inode *inode)
return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
}
+static inline bool f2fs_is_atomic_file(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
+}
+
+static inline bool f2fs_is_volatile_file(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
+}
+
static inline void *inline_data_addr(struct page *page)
{
struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1141,6 +1214,7 @@ void update_inode(struct inode *, struct page *);
void update_inode_page(struct inode *);
int f2fs_write_inode(struct inode *, struct writeback_control *);
void f2fs_evict_inode(struct inode *);
+void handle_failed_inode(struct inode *);
/*
* namei.c
@@ -1188,9 +1262,9 @@ struct dnode_of_data;
struct node_info;
bool available_free_memory(struct f2fs_sb_info *, int);
-int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
-bool fsync_mark_done(struct f2fs_sb_info *, nid_t);
-void fsync_mark_clear(struct f2fs_sb_info *, nid_t);
+bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
+bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
@@ -1221,6 +1295,8 @@ void destroy_node_manager_caches(void);
/*
* segment.c
*/
+void register_inmem_page(struct inode *, struct page *);
+void commit_inmem_pages(struct inode *, bool);
void f2fs_balance_fs(struct f2fs_sb_info *);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
int f2fs_issue_flush(struct f2fs_sb_info *);
@@ -1229,9 +1305,11 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
void clear_prefree_segments(struct f2fs_sb_info *);
+void release_discard_addrs(struct f2fs_sb_info *);
void discard_next_dnode(struct f2fs_sb_info *, block_t);
int npages_for_summary_flush(struct f2fs_sb_info *);
void allocate_new_segments(struct f2fs_sb_info *);
+int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
void write_meta_page(struct f2fs_sb_info *, struct page *);
void write_node_page(struct f2fs_sb_info *, struct page *,
@@ -1248,7 +1326,7 @@ void write_data_summaries(struct f2fs_sb_info *, block_t);
void write_node_summaries(struct f2fs_sb_info *, block_t);
int lookup_journal_in_cursum(struct f2fs_summary_block *,
int, unsigned int, int);
-void flush_sit_entries(struct f2fs_sb_info *);
+void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
int build_segment_manager(struct f2fs_sb_info *);
void destroy_segment_manager(struct f2fs_sb_info *);
int __init create_segment_manager_caches(void);
@@ -1259,7 +1337,8 @@ void destroy_segment_manager_caches(void);
*/
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
-int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
+struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t);
+int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
@@ -1271,11 +1350,11 @@ void add_orphan_inode(struct f2fs_sb_info *, nid_t);
void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
void recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
-void set_dirty_dir_page(struct inode *, struct page *);
+void update_dirty_page(struct inode *, struct page *);
void add_dirty_dir_inode(struct inode *);
void remove_dirty_dir_inode(struct inode *);
void sync_dirty_dir_inodes(struct f2fs_sb_info *);
-void write_checkpoint(struct f2fs_sb_info *, bool);
+void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
void init_ino_entry_info(struct f2fs_sb_info *);
int __init create_checkpoint_caches(void);
void destroy_checkpoint_caches(void);
@@ -1359,12 +1438,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_inc_inline_inode(inode) \
do { \
if (f2fs_has_inline_data(inode)) \
- ((F2FS_SB(inode->i_sb))->inline_inode++); \
+ ((F2FS_I_SB(inode))->inline_inode++); \
} while (0)
#define stat_dec_inline_inode(inode) \
do { \
if (f2fs_has_inline_data(inode)) \
- ((F2FS_SB(inode->i_sb))->inline_inode--); \
+ ((F2FS_I_SB(inode))->inline_inode--); \
} while (0)
#define stat_inc_seg_type(sbi, curseg) \
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 060aee65aee8..8e68bb64f835 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vma->vm_file);
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
int err;
@@ -117,7 +117,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
static inline bool need_do_checkpoint(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
bool need_cp = false;
if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
@@ -138,7 +138,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ nid_t ino = inode->i_ino;
int ret = 0;
bool need_cp = false;
struct writeback_control wbc = {
@@ -153,12 +154,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_f2fs_sync_file_enter(inode);
/* if fdatasync is triggered, let's do in-place-update */
- if (datasync)
+ if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
set_inode_flag(fi, FI_NEED_IPU);
-
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (datasync)
- clear_inode_flag(fi, FI_NEED_IPU);
+ clear_inode_flag(fi, FI_NEED_IPU);
+
if (ret) {
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
return ret;
@@ -168,13 +168,22 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* if there is no written data, don't waste time to write recovery info.
*/
if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
- !exist_written_data(sbi, inode->i_ino, APPEND_INO)) {
+ !exist_written_data(sbi, ino, APPEND_INO)) {
+ struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
+
+ /* But we need to avoid that there are some inode updates */
+ if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) {
+ f2fs_put_page(i, 0);
+ goto go_write;
+ }
+ f2fs_put_page(i, 0);
+
if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
- exist_written_data(sbi, inode->i_ino, UPDATE_INO))
+ exist_written_data(sbi, ino, UPDATE_INO))
goto flush_out;
goto out;
}
-
+go_write:
/* guarantee free sections for fsync */
f2fs_balance_fs(sbi);
@@ -207,26 +216,28 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
up_write(&fi->i_sem);
}
} else {
- /* if there is no written node page, write its inode page */
- while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
- if (fsync_mark_done(sbi, inode->i_ino))
- goto out;
+sync_nodes:
+ sync_node_pages(sbi, ino, &wbc);
+
+ if (need_inode_block_update(sbi, ino)) {
mark_inode_dirty_sync(inode);
ret = f2fs_write_inode(inode, NULL);
if (ret)
goto out;
+ goto sync_nodes;
}
- ret = wait_on_node_pages_writeback(sbi, inode->i_ino);
+
+ ret = wait_on_node_pages_writeback(sbi, ino);
if (ret)
goto out;
/* once recovery info is written, don't need to tack this */
- remove_dirty_inode(sbi, inode->i_ino, APPEND_INO);
+ remove_dirty_inode(sbi, ino, APPEND_INO);
clear_inode_flag(fi, FI_APPEND_WRITE);
flush_out:
- remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
+ remove_dirty_inode(sbi, ino, UPDATE_INO);
clear_inode_flag(fi, FI_UPDATE_WRITE);
- ret = f2fs_issue_flush(F2FS_SB(inode->i_sb));
+ ret = f2fs_issue_flush(F2FS_I_SB(inode));
}
out:
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -353,6 +364,8 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
maxbytes, i_size_read(inode));
case SEEK_DATA:
case SEEK_HOLE:
+ if (offset < 0)
+ return -ENXIO;
return f2fs_seek_block(file, offset, whence);
}
@@ -369,7 +382,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
int nr_free = 0, ofs = dn->ofs_in_node;
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_node *raw_node;
__le32 *addr;
@@ -432,7 +445,7 @@ out:
int truncate_blocks(struct inode *inode, u64 from, bool lock)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int blocksize = inode->i_sb->s_blocksize;
struct dnode_of_data dn;
pgoff_t free_from;
@@ -463,7 +476,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
count -= dn.ofs_in_node;
- f2fs_bug_on(count < 0);
+ f2fs_bug_on(sbi, count < 0);
if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
truncate_data_blocks_range(&dn, count);
@@ -547,15 +560,22 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
+ if (attr->ia_valid & ATTR_SIZE) {
err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
if (err)
return err;
- truncate_setsize(inode, attr->ia_size);
- f2fs_truncate(inode);
- f2fs_balance_fs(F2FS_SB(inode->i_sb));
+ if (attr->ia_size != i_size_read(inode)) {
+ truncate_setsize(inode, attr->ia_size);
+ f2fs_truncate(inode);
+ f2fs_balance_fs(F2FS_I_SB(inode));
+ } else {
+ /*
+ * giving a chance to truncate blocks past EOF which
+ * are fallocated with FALLOC_FL_KEEP_SIZE.
+ */
+ f2fs_truncate(inode);
+ }
}
__setattr_copy(inode, attr);
@@ -589,7 +609,7 @@ const struct inode_operations f2fs_file_inode_operations = {
static void fill_zero(struct inode *inode, pgoff_t index,
loff_t start, loff_t len)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page;
if (!len)
@@ -638,6 +658,13 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
loff_t off_start, off_end;
int ret = 0;
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ /* skip punching hole beyond i_size */
+ if (offset >= inode->i_size)
+ return ret;
+
ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
if (ret)
return ret;
@@ -661,7 +688,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (pg_start < pg_end) {
struct address_space *mapping = inode->i_mapping;
loff_t blk_start, blk_end;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
f2fs_balance_fs(sbi);
@@ -682,7 +709,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
static int expand_inode_data(struct inode *inode, loff_t offset,
loff_t len, int mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
pgoff_t index, pg_start, pg_end;
loff_t new_size = i_size_read(inode);
loff_t off_start, off_end;
@@ -778,61 +805,157 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
return flags & F2FS_OTHER_FLMASK;
}
-long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ return put_user(flags, (int __user *)arg);
+}
+
+static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
- unsigned int flags;
+ unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ unsigned int oldflags;
int ret;
- switch (cmd) {
- case F2FS_IOC_GETFLAGS:
- flags = fi->i_flags & FS_FL_USER_VISIBLE;
- return put_user(flags, (int __user *) arg);
- case F2FS_IOC_SETFLAGS:
- {
- unsigned int oldflags;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
+ if (!inode_owner_or_capable(inode)) {
+ ret = -EACCES;
+ goto out;
+ }
- if (!inode_owner_or_capable(inode)) {
- ret = -EACCES;
- goto out;
- }
+ if (get_user(flags, (int __user *)arg)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ flags = f2fs_mask_flags(inode->i_mode, flags);
+
+ mutex_lock(&inode->i_mutex);
- if (get_user(flags, (int __user *) arg)) {
- ret = -EFAULT;
+ oldflags = fi->i_flags;
+
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ mutex_unlock(&inode->i_mutex);
+ ret = -EPERM;
goto out;
}
+ }
- flags = f2fs_mask_flags(inode->i_mode, flags);
+ flags = flags & FS_FL_USER_MODIFIABLE;
+ flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
+ fi->i_flags = flags;
+ mutex_unlock(&inode->i_mutex);
- mutex_lock(&inode->i_mutex);
+ f2fs_set_inode_flags(inode);
+ inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+out:
+ mnt_drop_write_file(filp);
+ return ret;
+}
- oldflags = fi->i_flags;
+static int f2fs_ioc_start_atomic_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
- ret = -EPERM;
- goto out;
- }
- }
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
- flags = flags & FS_FL_USER_MODIFIABLE;
- flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
- fi->i_flags = flags;
- mutex_unlock(&inode->i_mutex);
+ f2fs_balance_fs(sbi);
- f2fs_set_inode_flags(inode);
- inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
-out:
- mnt_drop_write_file(filp);
+ set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+
+ return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
+}
+
+static int f2fs_ioc_commit_atomic_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ int ret;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (f2fs_is_volatile_file(inode))
+ return 0;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
return ret;
- }
+
+ if (f2fs_is_atomic_file(inode))
+ commit_inmem_pages(inode, false);
+
+ ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int f2fs_ioc_start_volatile_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ return 0;
+}
+
+static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ struct fstrim_range range;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max((unsigned int)range.minlen,
+ q->limits.discard_granularity);
+ ret = f2fs_trim_fs(F2FS_SB(sb), &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((struct fstrim_range __user *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+ return 0;
+}
+
+long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case F2FS_IOC_GETFLAGS:
+ return f2fs_ioc_getflags(filp, arg);
+ case F2FS_IOC_SETFLAGS:
+ return f2fs_ioc_setflags(filp, arg);
+ case F2FS_IOC_START_ATOMIC_WRITE:
+ return f2fs_ioc_start_atomic_write(filp);
+ case F2FS_IOC_COMMIT_ATOMIC_WRITE:
+ return f2fs_ioc_commit_atomic_write(filp);
+ case F2FS_IOC_START_VOLATILE_WRITE:
+ return f2fs_ioc_start_volatile_write(filp);
+ case FITRIM:
+ return f2fs_ioc_fitrim(filp, arg);
default:
return -ENOTTY;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 943a31db7cc3..2a8f4acdb86b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -193,7 +193,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
* selected by background GC before.
* Those segments guarantee they have small valid blocks.
*/
- for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) {
+ for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
if (sec_usage_check(sbi, secno))
continue;
clear_bit(secno, dirty_i->victim_secmap);
@@ -263,14 +263,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned int secno, max_cost;
int nsearched = 0;
+ mutex_lock(&dirty_i->seglist_lock);
+
p.alloc_mode = alloc_mode;
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
p.min_cost = max_cost = get_max_cost(sbi, &p);
- mutex_lock(&dirty_i->seglist_lock);
-
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -281,9 +281,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned long cost;
unsigned int segno;
- segno = find_next_bit(p.dirty_segmap,
- TOTAL_SEGS(sbi), p.offset);
- if (segno >= TOTAL_SEGS(sbi)) {
+ segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
+ if (segno >= MAIN_SEGS(sbi)) {
if (sbi->last_victim[p.gc_mode]) {
sbi->last_victim[p.gc_mode] = 0;
p.offset = 0;
@@ -423,6 +422,12 @@ next_step:
if (IS_ERR(node_page))
continue;
+ /* block may become invalid during get_node_page */
+ if (check_valid_map(sbi, segno, off) == 0) {
+ f2fs_put_page(node_page, 1);
+ continue;
+ }
+
/* set page dirty and write it */
if (gc_type == FG_GC) {
f2fs_wait_on_page_writeback(node_page, NODE);
@@ -531,7 +536,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
f2fs_wait_on_page_writeback(page, DATA);
if (clear_page_dirty_for_io(page))
- inode_dec_dirty_dents(inode);
+ inode_dec_dirty_pages(inode);
set_cold_data(page);
do_write_data_page(page, &fio);
clear_cold_data(page);
@@ -688,6 +693,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
int gc_type = BG_GC;
int nfree = 0;
int ret = -1;
+ struct cp_control cpc = {
+ .reason = CP_SYNC,
+ };
INIT_LIST_HEAD(&ilist);
gc_more:
@@ -698,7 +706,7 @@ gc_more:
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
gc_type = FG_GC;
- write_checkpoint(sbi, false);
+ write_checkpoint(sbi, &cpc);
}
if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
@@ -723,7 +731,7 @@ gc_more:
goto gc_more;
if (gc_type == FG_GC)
- write_checkpoint(sbi, false);
+ write_checkpoint(sbi, &cpc);
stop:
mutex_unlock(&sbi->gc_mutex);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3e8ecdf3742b..88036fd75797 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -15,11 +15,13 @@
bool f2fs_may_inline(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
block_t nr_blocks;
loff_t i_size;
- if (!test_opt(sbi, INLINE_DATA))
+ if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
+ return false;
+
+ if (f2fs_is_atomic_file(inode))
return false;
nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
@@ -35,7 +37,6 @@ bool f2fs_may_inline(struct inode *inode)
int f2fs_read_inline_data(struct inode *inode, struct page *page)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *ipage;
void *src_addr, *dst_addr;
@@ -44,7 +45,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
goto out;
}
- ipage = get_node_page(sbi, inode->i_ino);
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage)) {
unlock_page(page);
return PTR_ERR(ipage);
@@ -73,7 +74,7 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
struct dnode_of_data dn;
void *src_addr, *dst_addr;
block_t new_blk_addr;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_io_info fio = {
.type = DATA,
.rw = WRITE_SYNC | REQ_PRIO,
@@ -189,13 +190,12 @@ int f2fs_write_inline_data(struct inode *inode,
void truncate_inline_data(struct inode *inode, u64 from)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *ipage;
if (from >= MAX_INLINE_DATA)
return;
- ipage = get_node_page(sbi, inode->i_ino);
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage))
return;
@@ -209,7 +209,7 @@ void truncate_inline_data(struct inode *inode, u64 from)
bool recover_inline_data(struct inode *inode, struct page *npage)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode *ri = NULL;
void *src_addr, *dst_addr;
struct page *ipage;
@@ -229,7 +229,7 @@ bool recover_inline_data(struct inode *inode, struct page *npage)
ri && (ri->i_inline & F2FS_INLINE_DATA)) {
process_inline:
ipage = get_node_page(sbi, inode->i_ino);
- f2fs_bug_on(IS_ERR(ipage));
+ f2fs_bug_on(sbi, IS_ERR(ipage));
f2fs_wait_on_page_writeback(ipage, NODE);
@@ -243,7 +243,7 @@ process_inline:
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
- f2fs_bug_on(IS_ERR(ipage));
+ f2fs_bug_on(sbi, IS_ERR(ipage));
f2fs_wait_on_page_writeback(ipage, NODE);
zero_user_segment(ipage, INLINE_DATA_OFFSET,
INLINE_DATA_OFFSET + MAX_INLINE_DATA);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2c39999f3868..0deead4505e7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -69,7 +69,7 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
static int do_read_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct page *node_page;
struct f2fs_inode *ri;
@@ -218,7 +218,7 @@ void update_inode(struct inode *inode, struct page *node_page)
void update_inode_page(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *node_page;
retry:
node_page = get_node_page(sbi, inode->i_ino);
@@ -238,7 +238,7 @@ retry:
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
inode->i_ino == F2FS_META_INO(sbi))
@@ -266,9 +266,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
*/
void f2fs_evict_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ /* some remained atomic pages should discarded */
+ if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ commit_inmem_pages(inode, true);
+
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
@@ -276,7 +280,7 @@ void f2fs_evict_inode(struct inode *inode)
inode->i_ino == F2FS_META_INO(sbi))
goto out_clear;
- f2fs_bug_on(get_dirty_dents(inode));
+ f2fs_bug_on(sbi, get_dirty_pages(inode));
remove_dirty_dir_inode(inode);
if (inode->i_nlink || is_bad_inode(inode))
@@ -306,3 +310,26 @@ no_delete:
out_clear:
clear_inode(inode);
}
+
+/* caller should call f2fs_lock_op() */
+void handle_failed_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ clear_nlink(inode);
+ make_bad_inode(inode);
+ unlock_new_inode(inode);
+
+ i_size_write(inode, 0);
+ if (F2FS_HAS_BLOCKS(inode))
+ f2fs_truncate(inode);
+
+ remove_inode_page(inode);
+ stat_dec_inline_inode(inode);
+
+ alloc_nid_failed(sbi, inode->i_ino);
+ f2fs_unlock_op(sbi);
+
+ /* iput will drop the inode object */
+ iput(inode);
+}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index ee103fd7283c..0d2526e5aa11 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -23,7 +23,7 @@
static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
nid_t ino;
struct inode *inode;
bool nid_free = false;
@@ -102,7 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
nid_t ino = 0;
int err;
@@ -123,9 +123,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
alloc_nid_done(sbi, ino);
@@ -133,9 +133,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
unlock_new_inode(inode);
return 0;
out:
- clear_nlink(inode);
- iget_failed(inode);
- alloc_nid_failed(sbi, ino);
+ handle_failed_inode(inode);
return err;
}
@@ -143,7 +141,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
int err;
f2fs_balance_fs(sbi);
@@ -154,15 +152,16 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
d_instantiate(dentry, inode);
return 0;
out:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
iput(inode);
+ f2fs_unlock_op(sbi);
return err;
}
@@ -203,7 +202,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode = dentry->d_inode;
struct f2fs_dir_entry *de;
struct page *page;
@@ -237,7 +236,7 @@ fail:
static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
size_t symlen = strlen(symname) + 1;
int err;
@@ -253,9 +252,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
err = page_symlink(inode, symname, symlen);
alloc_nid_done(sbi, inode->i_ino);
@@ -264,15 +263,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
unlock_new_inode(inode);
return err;
out:
- clear_nlink(inode);
- iget_failed(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err;
@@ -290,9 +287,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out_fail;
+ f2fs_unlock_op(sbi);
alloc_nid_done(sbi, inode->i_ino);
@@ -303,9 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
out_fail:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
- clear_nlink(inode);
- iget_failed(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
@@ -320,7 +315,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err = 0;
@@ -338,25 +333,23 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
alloc_nid_done(sbi, inode->i_ino);
d_instantiate(dentry, inode);
unlock_new_inode(inode);
return 0;
out:
- clear_nlink(inode);
- iget_failed(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct page *old_dir_page;
@@ -480,8 +473,7 @@ out:
static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct super_block *sb = old_dir->i_sb;
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct page *old_dir_page, *new_dir_page;
@@ -642,7 +634,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err;
@@ -678,10 +670,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
release_out:
release_orphan_inode(sbi);
out:
- f2fs_unlock_op(sbi);
- clear_nlink(inode);
- iget_failed(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 45378196e19a..44b8afef43d9 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -54,7 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
static void clear_node_page_dirty(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
unsigned int long flags;
if (PageDirty(page)) {
@@ -65,7 +64,7 @@ static void clear_node_page_dirty(struct page *page)
spin_unlock_irqrestore(&mapping->tree_lock, flags);
clear_page_dirty_for_io(page);
- dec_page_count(sbi, F2FS_DIRTY_NODES);
+ dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
}
ClearPageUptodate(page);
}
@@ -92,7 +91,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
/* get current nat block page with lock */
src_page = get_meta_page(sbi, src_off);
dst_page = grab_meta_page(sbi, dst_off);
- f2fs_bug_on(PageDirty(src_page));
+ f2fs_bug_on(sbi, PageDirty(src_page));
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
@@ -124,44 +123,99 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
kmem_cache_free(nat_entry_slab, e);
}
-int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
+{
+ nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
+ struct nat_entry_set *head;
+
+ if (get_nat_flag(ne, IS_DIRTY))
+ return;
+retry:
+ head = radix_tree_lookup(&nm_i->nat_set_root, set);
+ if (!head) {
+ head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
+
+ INIT_LIST_HEAD(&head->entry_list);
+ INIT_LIST_HEAD(&head->set_list);
+ head->set = set;
+ head->entry_cnt = 0;
+
+ if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
+ cond_resched();
+ goto retry;
+ }
+ }
+ list_move_tail(&ne->list, &head->entry_list);
+ nm_i->dirty_nat_cnt++;
+ head->entry_cnt++;
+ set_nat_flag(ne, IS_DIRTY, true);
+}
+
+static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
+{
+ nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
+ struct nat_entry_set *head;
+
+ head = radix_tree_lookup(&nm_i->nat_set_root, set);
+ if (head) {
+ list_move_tail(&ne->list, &nm_i->nat_entries);
+ set_nat_flag(ne, IS_DIRTY, false);
+ head->entry_cnt--;
+ nm_i->dirty_nat_cnt--;
+ }
+}
+
+static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
+ nid_t start, unsigned int nr, struct nat_entry_set **ep)
+{
+ return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
+ start, nr);
+}
+
+bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- int is_cp = 1;
+ bool is_cp = true;
read_lock(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
- if (e && !e->checkpointed)
- is_cp = 0;
+ if (e && !get_nat_flag(e, IS_CHECKPOINTED))
+ is_cp = false;
read_unlock(&nm_i->nat_tree_lock);
return is_cp;
}
-bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid)
+bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- bool fsync_done = false;
+ bool fsynced = false;
read_lock(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
- if (e)
- fsync_done = e->fsync_done;
+ e = __lookup_nat_cache(nm_i, ino);
+ if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
+ fsynced = true;
read_unlock(&nm_i->nat_tree_lock);
- return fsync_done;
+ return fsynced;
}
-void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid)
+bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
+ bool need_update = true;
- write_lock(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
- if (e)
- e->fsync_done = false;
- write_unlock(&nm_i->nat_tree_lock);
+ read_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, ino);
+ if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
+ (get_nat_flag(e, IS_CHECKPOINTED) ||
+ get_nat_flag(e, HAS_FSYNCED_INODE)))
+ need_update = false;
+ read_unlock(&nm_i->nat_tree_lock);
+ return need_update;
}
static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
@@ -177,7 +231,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
}
memset(new, 0, sizeof(struct nat_entry));
nat_set_nid(new, nid);
- new->checkpointed = true;
+ nat_reset_flag(new);
list_add_tail(&new->list, &nm_i->nat_entries);
nm_i->nat_cnt++;
return new;
@@ -216,7 +270,7 @@ retry:
goto retry;
}
e->ni = *ni;
- f2fs_bug_on(ni->blk_addr == NEW_ADDR);
+ f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
} else if (new_blkaddr == NEW_ADDR) {
/*
* when nid is reallocated,
@@ -224,16 +278,16 @@ retry:
* So, reinitialize it with new information.
*/
e->ni = *ni;
- f2fs_bug_on(ni->blk_addr != NULL_ADDR);
+ f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
}
/* sanity check */
- f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
- f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
new_blkaddr == NULL_ADDR);
- f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR &&
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
new_blkaddr == NEW_ADDR);
- f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR &&
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
nat_get_blkaddr(e) != NULL_ADDR &&
new_blkaddr == NEW_ADDR);
@@ -245,12 +299,17 @@ retry:
/* change address */
nat_set_blkaddr(e, new_blkaddr);
+ if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
+ set_nat_flag(e, IS_CHECKPOINTED, false);
__set_nat_cache_dirty(nm_i, e);
/* update fsync_mark if its inode nat entry is still alive */
e = __lookup_nat_cache(nm_i, ni->ino);
- if (e)
- e->fsync_done = fsync_done;
+ if (e) {
+ if (fsync_done && ni->nid == ni->ino)
+ set_nat_flag(e, HAS_FSYNCED_INODE, true);
+ set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
+ }
w