aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CREDITS6
-rw-r--r--Documentation/filesystems/gfs2.txt43
-rw-r--r--MAINTAINERS18
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/configfs/item.c2
-rw-r--r--fs/dlm/Kconfig21
-rw-r--r--fs/dlm/Makefile19
-rw-r--r--fs/dlm/ast.c173
-rw-r--r--fs/dlm/ast.h26
-rw-r--r--fs/dlm/config.c789
-rw-r--r--fs/dlm/config.h42
-rw-r--r--fs/dlm/debug_fs.c387
-rw-r--r--fs/dlm/dir.c423
-rw-r--r--fs/dlm/dir.h30
-rw-r--r--fs/dlm/dlm_internal.h543
-rw-r--r--fs/dlm/lock.c3871
-rw-r--r--fs/dlm/lock.h62
-rw-r--r--fs/dlm/lockspace.c717
-rw-r--r--fs/dlm/lockspace.h25
-rw-r--r--fs/dlm/lowcomms.c1238
-rw-r--r--fs/dlm/lowcomms.h26
-rw-r--r--fs/dlm/lvb_table.h18
-rw-r--r--fs/dlm/main.c97
-rw-r--r--fs/dlm/member.c327
-rw-r--r--fs/dlm/member.h24
-rw-r--r--fs/dlm/memory.c116
-rw-r--r--fs/dlm/memory.h29
-rw-r--r--fs/dlm/midcomms.c140
-rw-r--r--fs/dlm/midcomms.h21
-rw-r--r--fs/dlm/rcom.c472
-rw-r--r--fs/dlm/rcom.h24
-rw-r--r--fs/dlm/recover.c765
-rw-r--r--fs/dlm/recover.h34
-rw-r--r--fs/dlm/recoverd.c290
-rw-r--r--fs/dlm/recoverd.h24
-rw-r--r--fs/dlm/requestqueue.c184
-rw-r--r--fs/dlm/requestqueue.h22
-rw-r--r--fs/dlm/user.c788
-rw-r--r--fs/dlm/user.h16
-rw-r--r--fs/dlm/util.c161
-rw-r--r--fs/dlm/util.h22
-rw-r--r--fs/gfs2/Kconfig44
-rw-r--r--fs/gfs2/Makefile10
-rw-r--r--fs/gfs2/acl.c309
-rw-r--r--fs/gfs2/acl.h39
-rw-r--r--fs/gfs2/bmap.c1221
-rw-r--r--fs/gfs2/bmap.h31
-rw-r--r--fs/gfs2/daemon.c196
-rw-r--r--fs/gfs2/daemon.h19
-rw-r--r--fs/gfs2/dir.c1961
-rw-r--r--fs/gfs2/dir.h79
-rw-r--r--fs/gfs2/eaops.c230
-rw-r--r--fs/gfs2/eaops.h30
-rw-r--r--fs/gfs2/eattr.c1501
-rw-r--r--fs/gfs2/eattr.h100
-rw-r--r--fs/gfs2/gfs2.h31
-rw-r--r--fs/gfs2/glock.c2231
-rw-r--r--fs/gfs2/glock.h153
-rw-r--r--fs/gfs2/glops.c615
-rw-r--r--fs/gfs2/glops.h25
-rw-r--r--fs/gfs2/incore.h634
-rw-r--r--fs/gfs2/inode.c1379
-rw-r--r--fs/gfs2/inode.h56
-rw-r--r--fs/gfs2/lm.c217
-rw-r--r--fs/gfs2/lm.h42
-rw-r--r--fs/gfs2/locking.c184
-rw-r--r--fs/gfs2/locking/dlm/Makefile3
-rw-r--r--fs/gfs2/locking/dlm/lock.c524
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h187
-rw-r--r--fs/gfs2/locking/dlm/main.c64
-rw-r--r--fs/gfs2/locking/dlm/mount.c255
-rw-r--r--fs/gfs2/locking/dlm/plock.c301
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c226
-rw-r--r--fs/gfs2/locking/dlm/thread.c359
-rw-r--r--fs/gfs2/locking/nolock/Makefile3
-rw-r--r--fs/gfs2/locking/nolock/main.c246
-rw-r--r--fs/gfs2/log.c687
-rw-r--r--fs/gfs2/log.h65
-rw-r--r--fs/gfs2/lops.c809
-rw-r--r--fs/gfs2/lops.h99
-rw-r--r--fs/gfs2/main.c150
-rw-r--r--fs/gfs2/meta_io.c590
-rw-r--r--fs/gfs2/meta_io.h78
-rw-r--r--fs/gfs2/mount.c214
-rw-r--r--fs/gfs2/mount.h17
-rw-r--r--fs/gfs2/ondisk.c308
-rw-r--r--fs/gfs2/ops_address.c790
-rw-r--r--fs/gfs2/ops_address.h22
-rw-r--r--fs/gfs2/ops_dentry.c119
-rw-r--r--fs/gfs2/ops_dentry.h17
-rw-r--r--fs/gfs2/ops_export.c298
-rw-r--r--fs/gfs2/ops_export.h22
-rw-r--r--fs/gfs2/ops_file.c661
-rw-r--r--fs/gfs2/ops_file.h24
-rw-r--r--fs/gfs2/ops_fstype.c928
-rw-r--r--fs/gfs2/ops_fstype.h18
-rw-r--r--fs/gfs2/ops_inode.c1151
-rw-r--r--fs/gfs2/ops_inode.h20
-rw-r--r--fs/gfs2/ops_super.c468
-rw-r--r--fs/gfs2/ops_super.h17
-rw-r--r--fs/gfs2/ops_vm.c184
-rw-r--r--fs/gfs2/ops_vm.h18
-rw-r--r--fs/gfs2/quota.c1227
-rw-r--r--fs/gfs2/quota.h35
-rw-r--r--fs/gfs2/recovery.c570
-rw-r--r--fs/gfs2/recovery.h34
-rw-r--r--fs/gfs2/rgrp.c1513
-rw-r--r--fs/gfs2/rgrp.h69
-rw-r--r--fs/gfs2/super.c976
-rw-r--r--fs/gfs2/super.h55
-rw-r--r--fs/gfs2/sys.c583
-rw-r--r--fs/gfs2/sys.h27
-rw-r--r--fs/gfs2/trans.c184
-rw-r--r--fs/gfs2/trans.h39
-rw-r--r--fs/gfs2/util.c245
-rw-r--r--fs/gfs2/util.h170
-rw-r--r--include/linux/Kbuild4
-rw-r--r--include/linux/dlm.h302
-rw-r--r--include/linux/dlm_device.h86
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/gfs2_ondisk.h443
-rw-r--r--include/linux/lm_interface.h273
-rw-r--r--include/linux/lock_dlm_plock.h41
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/readahead.c1
126 files changed, 40197 insertions, 6 deletions
diff --git a/CREDITS b/CREDITS
index 5d75254bcb81..5329ead9c672 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3578,11 +3578,11 @@ S: Fargo, North Dakota 58122
S: USA
N: Steven Whitehouse
-E: SteveW@ACM.org
+E: steve@chygwyn.com
W: http://www.chygwyn.com/~steve
-D: Linux DECnet project: http://www.sucs.swan.ac.uk/~rohan/DECnet/index.html
+D: Linux DECnet project
D: Minor debugging of other networking protocols.
-D: Misc bug fixes and filesystem development
+D: Misc bug fixes and GFS2 filesystem development
N: Hans-Joachim Widmaier
E: hjw@zvw.de
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
new file mode 100644
index 000000000000..593004b6bbab
--- /dev/null
+++ b/Documentation/filesystems/gfs2.txt
@@ -0,0 +1,43 @@
+Global File System
+------------------
+
+http://sources.redhat.com/cluster/
+
+GFS is a cluster file system. It allows a cluster of computers to
+simultaneously use a block device that is shared between them (with FC,
+iSCSI, NBD, etc). GFS reads and writes to the block device like a local
+file system, but also uses a lock module to allow the computers to coordinate
+their I/O so file system consistency is maintained. One of the nifty
+features of GFS is perfect consistency -- changes made to the file system
+on one machine show up immediately on all other machines in the cluster.
+
+GFS uses interchangeable inter-node locking mechanisms. Different lock
+modules can plug into GFS and each file system selects the appropriate
+lock module at mount time. Lock modules include:
+
+ lock_nolock -- allows gfs to be used as a local file system
+
+ lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
+ The dlm is found at linux/fs/dlm/
+
+In addition to interfacing with an external locking manager, a gfs lock
+module is responsible for interacting with external cluster management
+systems. Lock_dlm depends on user space cluster management systems found
+at the URL above.
+
+To use gfs as a local file system, no external clustering systems are
+needed, simply:
+
+ $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
+ $ mount -t gfs2 /dev/block_device /dir
+
+GFS2 is not on-disk compatible with previous versions of GFS.
+
+The following man pages can be found at the URL above:
+ gfs2_fsck to repair a filesystem
+ gfs2_grow to expand a filesystem online
+ gfs2_jadd to add journals to a filesystem online
+ gfs2_tool to manipulate, examine and tune a filesystem
+ gfs2_quota to examine and change quota values in a filesystem
+ mount.gfs2 to help mount(8) mount a filesystem
+ mkfs.gfs2 to make a filesystem
diff --git a/MAINTAINERS b/MAINTAINERS
index 8c35b3c503aa..17becb9b1a96 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -898,6 +898,16 @@ M: jack@suse.cz
L: linux-kernel@vger.kernel.org
S: Maintained
+DISTRIBUTED LOCK MANAGER
+P: Patrick Caulfield
+M: pcaulfie@redhat.com
+P: David Teigland
+M: teigland@redhat.com
+L: cluster-devel@redhat.com
+W: http://sources.redhat.com/cluster/
+T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
+S: Supported
+
DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER
P: Tobias Ringstrom
M: tori@unhappy.mine.nu
@@ -1173,6 +1183,14 @@ M: khc@pm.waw.pl
W: http://www.kernel.org/pub/linux/utils/net/hdlc/
S: Maintained
+GFS2 FILE SYSTEM
+P: Steven Whitehouse
+M: swhiteho@redhat.com
+L: cluster-devel@redhat.com
+W: http://sources.redhat.com/cluster/
+T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
+S: Supported
+
GIGASET ISDN DRIVERS
P: Hansjoerg Lipp
M: hjlipp@web.de
diff --git a/fs/Kconfig b/fs/Kconfig
index 674cfbb83a95..599de54451af 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -325,6 +325,7 @@ config FS_POSIX_ACL
default n
source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
config OCFS2_FS
tristate "OCFS2 file system support"
@@ -1995,6 +1996,7 @@ endmenu
endif
source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index fd24d67a7cdb..df614eacee86 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_CONFIGFS_FS) += configfs/
obj-y += devpts/
obj-$(CONFIG_PROFILING) += dcookies.o
+obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_REISERFS_FS) += reiserfs/
@@ -110,3 +111,4 @@ obj-$(CONFIG_HOSTFS) += hostfs/
obj-$(CONFIG_HPPFS) += hppfs/
obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
+obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
EXPORT_SYMBOL(config_group_init);
EXPORT_SYMBOL(config_item_get);
EXPORT_SYMBOL(config_item_put);
-
+EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..490f85b3fa59
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,21 @@
+menu "Distributed Lock Manager"
+ depends on INET && EXPERIMENTAL
+
+config DLM
+ tristate "Distributed Lock Manager (DLM)"
+ depends on IPV6 || IPV6=n
+ depends on IP_SCTP
+ select CONFIGFS_FS
+ help
+ A general purpose distributed lock manager for kernel or userspace
+ applications.
+
+config DLM_DEBUG
+ bool "DLM debugging"
+ depends on DLM
+ help
+ Under the debugfs mount point, the name of each lockspace will
+ appear as a file in the "dlm" directory. The output is the
+ list of resources and locks the local node knows about.
+
+endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1832e0297f7d
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,19 @@
+obj-$(CONFIG_DLM) += dlm.o
+dlm-y := ast.o \
+ config.o \
+ dir.o \
+ lock.o \
+ lockspace.o \
+ lowcomms.o \
+ main.o \
+ member.o \
+ memory.o \
+ midcomms.o \
+ rcom.o \
+ recover.o \
+ recoverd.o \
+ requestqueue.o \
+ user.o \
+ util.o
+dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
+
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..f91d39cb1e0b
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,173 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lock.h"
+#include "user.h"
+
+#define WAKE_ASTS 0
+
+static struct list_head ast_queue;
+static spinlock_t ast_queue_lock;
+static struct task_struct * astd_task;
+static unsigned long astd_wakeflags;
+static struct mutex astd_running;
+
+
+/*
+ * Unlink an lkb from the pending-AST queue.  An lkb is linked on
+ * ast_queue exactly while one of the AST_COMP/AST_BAST bits is set in
+ * lkb_ast_type (see dlm_add_ast() and process_asts()), so that is the
+ * condition checked, under ast_queue_lock, before unlinking.
+ */
+void dlm_del_ast(struct dlm_lkb *lkb)
+{
+	int queued;
+
+	spin_lock(&ast_queue_lock);
+	queued = !!(lkb->lkb_ast_type & (AST_COMP | AST_BAST));
+	if (queued)
+		list_del(&lkb->lkb_astqueue);
+	spin_unlock(&ast_queue_lock);
+}
+
+/*
+ * Record that an AST of the given type (AST_COMP and/or AST_BAST) is
+ * pending for an lkb and wake the dlm_astd thread to deliver it.
+ *
+ * Locks flagged DLM_IFL_USER are handed to dlm_user_add_ast() instead
+ * and never reach the kernel queue.
+ */
+void dlm_add_ast(struct dlm_lkb *lkb, int type)
+{
+	int already_queued;
+
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		dlm_user_add_ast(lkb, type);
+		return;
+	}
+	DLM_ASSERT(lkb->lkb_astaddr != DLM_FAKE_USER_AST, dlm_print_lkb(lkb););
+
+	spin_lock(&ast_queue_lock);
+	already_queued = !!(lkb->lkb_ast_type & (AST_COMP | AST_BAST));
+	if (!already_queued) {
+		/* first pending AST: the queue holds its own lkb reference,
+		   dropped again by process_asts() */
+		kref_get(&lkb->lkb_ref);
+		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+	}
+	lkb->lkb_ast_type |= type;
+	spin_unlock(&ast_queue_lock);
+
+	set_bit(WAKE_ASTS, &astd_wakeflags);
+	wake_up_process(astd_task);
+}
+
+/*
+ * Unlink and return the first queued lkb whose lockspace has not stopped
+ * locking, or NULL when there is no such lkb.  The pending AST type bits
+ * are handed back through *type and cleared in the lkb, all under
+ * ast_queue_lock.
+ */
+static struct dlm_lkb *dequeue_ast(int *type)
+{
+	struct dlm_lkb *lkb, *picked = NULL;
+
+	spin_lock(&ast_queue_lock);
+	list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
+		if (dlm_locking_stopped(lkb->lkb_resource->res_ls))
+			continue;
+
+		list_del(&lkb->lkb_astqueue);
+		*type = lkb->lkb_ast_type;
+		lkb->lkb_ast_type = 0;
+		picked = lkb;
+		break;
+	}
+	spin_unlock(&ast_queue_lock);
+
+	return picked;
+}
+
+/*
+ * Deliver pending ASTs one lkb at a time until no deliverable entry is
+ * left on the queue.  The callbacks run without ast_queue_lock held.
+ */
+static void process_asts(void)
+{
+	struct dlm_lkb *lkb;
+	void (*cast) (long param);
+	void (*bast) (long param, int mode);
+	int type = 0, bmode;
+
+	while ((lkb = dequeue_ast(&type)) != NULL) {
+		cast = lkb->lkb_astaddr;
+		bast = lkb->lkb_bastaddr;
+		bmode = lkb->lkb_bastmode;
+
+		if ((type & AST_COMP) && cast)
+			cast(lkb->lkb_astparam);
+
+		/* FIXME: Is it safe to look at lkb_grmode here
+		   without doing a lock_rsb() ?
+		   Look at other checks in v1 to avoid basts. */
+
+		if ((type & AST_BAST) && bast &&
+		    !dlm_modes_compat(lkb->lkb_grmode, bmode))
+			bast(lkb->lkb_astparam, bmode);
+
+		/* this removes the reference added by dlm_add_ast
+		   and may result in the lkb being freed */
+		dlm_put_lkb(lkb);
+
+		schedule();
+	}
+}
+
+/* Return nonzero when ast_queue is empty, sampled under ast_queue_lock. */
+static inline int no_asts(void)
+{
+	int empty;
+
+	spin_lock(&ast_queue_lock);
+	empty = list_empty(&ast_queue);
+	spin_unlock(&ast_queue_lock);
+
+	return empty;
+}
+
+/*
+ * Main loop of the "dlm_astd" kernel thread.
+ *
+ * Sleeps until WAKE_ASTS is set in astd_wakeflags (by dlm_add_ast() or
+ * dlm_astd_wake()) and then delivers the queued ASTs.  Delivery happens
+ * with astd_running held, so dlm_astd_suspend() can hold it off.
+ *
+ * Returns 0 once kthread_stop() has been called on the thread.
+ */
+static int dlm_astd(void *data)
+{
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		/*
+		 * Re-check the stop request after the task state is set.
+		 * A kthread_stop() that arrives between the loop condition
+		 * and set_current_state() wakes an already-running task,
+		 * which has no effect; without this test the thread would
+		 * then go to sleep and never see the stop request.
+		 */
+		if (!test_bit(WAKE_ASTS, &astd_wakeflags) &&
+		    !kthread_should_stop())
+			schedule();
+		set_current_state(TASK_RUNNING);
+
+		mutex_lock(&astd_running);
+		if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
+			process_asts();
+		mutex_unlock(&astd_running);
+	}
+	return 0;
+}
+
+/* Wake the dlm_astd thread, but only if there are ASTs queued for it. */
+void dlm_astd_wake(void)
+{
+	if (no_asts())
+		return;
+
+	set_bit(WAKE_ASTS, &astd_wakeflags);
+	wake_up_process(astd_task);
+}
+
+/*
+ * Initialise the AST queue, its lock and the astd_running mutex, then
+ * start the "dlm_astd" thread.
+ *
+ * Returns 0 on success or the error encoded in kthread_run()'s result;
+ * astd_task is only set on success.
+ */
+int dlm_astd_start(void)
+{
+	struct task_struct *task;
+
+	INIT_LIST_HEAD(&ast_queue);
+	spin_lock_init(&ast_queue_lock);
+	mutex_init(&astd_running);
+
+	task = kthread_run(dlm_astd, NULL, "dlm_astd");
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	astd_task = task;
+	return 0;
+}
+
+void dlm_astd_stop(void) /* stop the thread that dlm_astd_start() stored in astd_task */
+{
+ kthread_stop(astd_task);
+}
+
+void dlm_astd_suspend(void) /* hold off AST delivery: dlm_astd runs process_asts() under astd_running */
+{
+ mutex_lock(&astd_running);
+}
+
+void dlm_astd_resume(void) /* counterpart of dlm_astd_suspend(): releases astd_running */
+{
+ mutex_unlock(&astd_running);
+}
+
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __ASTD_DOT_H__
+#define __ASTD_DOT_H__
+
+void dlm_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_del_ast(struct dlm_lkb *lkb);
+
+void dlm_astd_wake(void);
+int dlm_astd_start(void);
+void dlm_astd_stop(void);
+void dlm_astd_suspend(void);
+void dlm_astd_resume(void);
+
+#endif
+
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <net/sock.h>
+
+#include "config.h"
+#include "lowcomms.h"
+
+/*
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/comms/<comm>/nodeid
+ * /config/dlm/<cluster>/comms/<comm>/local
+ * /config/dlm/<cluster>/comms/<comm>/addr
+ * The <cluster> level is useless, but I haven't figured out how to avoid it.
+ */
+
+static struct config_group *space_list;
+static struct config_group *comm_list;
+static struct comm *local_comm;
+
+struct clusters;
+struct cluster;
+struct spaces;
+struct space;
+struct comms;
+struct comm;
+struct nodes;
+struct node;
+
+static struct config_group *make_cluster(struct config_group *, const char *);
+static void drop_cluster(struct config_group *, struct config_item *);
+static void release_cluster(struct config_item *);
+static struct config_group *make_space(struct config_group *, const char *);
+static void drop_space(struct config_group *, struct config_item *);
+static void release_space(struct config_item *);
+static struct config_item *make_comm(struct config_group *, const char *);
+static void drop_comm(struct config_group *, struct config_item *);
+static void release_comm(struct config_item *);
+static struct config_item *make_node(struct config_group *, const char *);
+static void drop_node(struct config_group *, struct config_item *);
+static void release_node(struct config_item *);
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+ char *buf);
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+ const char *buf, size_t len);
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+ char *buf);
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+ const char *buf, size_t len);
+
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_local_read(struct comm *cm, char *buf);
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t node_nodeid_read(struct node *nd, char *buf);
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
+static ssize_t node_weight_read(struct node *nd, char *buf);
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
+
+enum {
+ COMM_ATTR_NODEID = 0,
+ COMM_ATTR_LOCAL,
+ COMM_ATTR_ADDR,
+};
+
+struct comm_attribute {
+ struct configfs_attribute attr;
+ ssize_t (*show)(struct comm *, char *);
+ ssize_t (*store)(struct comm *, const char *, size_t);
+};
+
+static struct comm_attribute comm_attr_nodeid = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "nodeid",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = comm_nodeid_read,
+ .store = comm_nodeid_write,
+};
+
+static struct comm_attribute comm_attr_local = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "local",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = comm_local_read,
+ .store = comm_local_write,
+};
+
+static struct comm_attribute comm_attr_addr = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "addr",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .store = comm_addr_write,
+};
+
+static struct configfs_attribute *comm_attrs[] = {
+ [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
+ [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
+ [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
+ NULL,
+};
+
+enum {
+ NODE_ATTR_NODEID = 0,
+ NODE_ATTR_WEIGHT,
+};
+
+struct node_attribute {
+ struct configfs_attribute attr;
+ ssize_t (*show)(struct node *, char *);
+ ssize_t (*store)(struct node *, const char *, size_t);
+};
+
+static struct node_attribute node_attr_nodeid = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "nodeid",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = node_nodeid_read,
+ .store = node_nodeid_write,
+};
+
+static struct node_attribute node_attr_weight = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "weight",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = node_weight_read,
+ .store = node_weight_write,
+};
+
+static struct configfs_attribute *node_attrs[] = {
+ [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
+ [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
+ NULL,
+};
+
+struct clusters {
+ struct configfs_subsystem subsys;
+};
+
+struct cluster {
+ struct config_group group;
+};
+
+struct spaces {
+ struct config_group ss_group;
+};
+
+struct space {
+ struct config_group group;
+ struct list_head members;
+ struct mutex members_lock;
+ int members_count;
+};
+
+struct comms {
+ struct config_group cs_group;
+};
+
+struct comm {
+ struct config_item item;
+ int nodeid;
+ int local;
+ int addr_count;
+ struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+};
+
+struct nodes {
+ struct config_group ns_group;
+};
+
+struct node {
+ struct config_item item;
+ struct list_head list; /* space->members */
+ int nodeid;
+ int weight;
+};
+
+static struct configfs_group_operations clusters_ops = {
+ .make_group = make_cluster,
+ .drop_item = drop_cluster,
+};
+
+static struct configfs_item_operations cluster_ops = {
+ .release = release_cluster,
+};
+
+static struct configfs_group_operations spaces_ops = {
+ .make_group = make_space,
+ .drop_item = drop_space,
+};
+
+static struct configfs_item_operations space_ops = {
+ .release = release_space,
+};
+
+static struct configfs_group_operations comms_ops = {
+ .make_item = make_comm,
+ .drop_item = drop_comm,
+};
+
+static struct configfs_item_operations comm_ops = {
+ .release = release_comm,
+ .show_attribute = show_comm,
+ .store_attribute = store_comm,
+};
+
+static struct configfs_group_operations nodes_ops = {
+ .make_item = make_node,
+ .drop_item = drop_node,
+};
+
+static struct configfs_item_operations node_ops = {
+ .release = release_node,
+ .show_attribute = show_node,
+ .store_attribute = store_node,
+};
+
+static struct config_item_type clusters_type = {
+ .ct_group_ops = &clusters_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type cluster_type = {
+ .ct_item_ops = &cluster_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type spaces_type = {
+ .ct_group_ops = &spaces_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type space_type = {
+ .ct_item_ops = &space_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comms_type = {
+ .ct_group_ops = &comms_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comm_type = {
+ .ct_item_ops = &comm_ops,
+ .ct_attrs = comm_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type nodes_type = {
+ .ct_group_ops = &nodes_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type node_type = {
+ .ct_item_ops = &node_ops,
+ .ct_attrs = node_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/* Map a config_item to the struct cluster embedding its group; NULL in, NULL out. */
+static struct cluster *to_cluster(struct config_item *i)
+{
+	if (!i)
+		return NULL;
+
+	return container_of(to_config_group(i), struct cluster, group);
+}
+
+/* Map a config_item to the struct space embedding its group; NULL in, NULL out. */
+static struct space *to_space(struct config_item *i)
+{
+	if (!i)
+		return NULL;
+
+	return container_of(to_config_group(i), struct space, group);
+}
+
+/* Map a config_item to the struct comm that embeds it; NULL in, NULL out. */
+static struct comm *to_comm(struct config_item *i)
+{
+	if (!i)
+		return NULL;
+
+	return container_of(i, struct comm, item);
+}
+
+/* Map a config_item to the struct node that embeds it; NULL in, NULL out. */
+static struct node *to_node(struct config_item *i)
+{
+	if (!i)
+		return NULL;
+
+	return container_of(i, struct node, item);
+}
+
+/*
+ * configfs make_group callback for the "dlm" subsystem root: create a
+ * cluster group with its two default sub-groups, "spaces" and "comms".
+ *
+ * The new sub-groups are also recorded in the file-scope space_list and
+ * comm_list pointers used by the lookup helpers further down.
+ *
+ * Returns the new group, or NULL if any allocation failed.
+ */
+static struct config_group *make_cluster(struct config_group *g,
+					 const char *name)
+{
+	struct cluster *cluster;
+	struct config_group **defaults;
+	struct spaces *spaces_dir;
+	struct comms *comms_dir;
+
+	cluster = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+	defaults = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+	spaces_dir = kzalloc(sizeof(struct spaces), GFP_KERNEL);
+	comms_dir = kzalloc(sizeof(struct comms), GFP_KERNEL);
+
+	if (!(cluster && defaults && spaces_dir && comms_dir)) {
+		/* kfree(NULL) is a no-op, so free whatever was obtained */
+		kfree(cluster);
+		kfree(defaults);
+		kfree(spaces_dir);
+		kfree(comms_dir);
+		return NULL;
+	}
+
+	config_group_init_type_name(&cluster->group, name, &cluster_type);
+	config_group_init_type_name(&spaces_dir->ss_group, "spaces",
+				    &spaces_type);
+	config_group_init_type_name(&comms_dir->cs_group, "comms",
+				    &comms_type);
+
+	/* NULL-terminated array of default groups */
+	defaults[0] = &spaces_dir->ss_group;
+	defaults[1] = &comms_dir->cs_group;
+	defaults[2] = NULL;
+	cluster->group.default_groups = defaults;
+
+	space_list = &spaces_dir->ss_group;
+	comm_list = &comms_dir->cs_group;
+
+	return &cluster->group;
+}
+
+/*
+ * configfs drop_item callback for a cluster: release each default
+ * sub-group, clear the file-scope space_list/comm_list pointers and drop
+ * the cluster item itself.
+ */
+static void drop_cluster(struct config_group *g, struct config_item *i)
+{
+	struct cluster *cl = to_cluster(i);
+	struct config_group **slot;
+
+	/* walk the NULL-terminated array, clearing each slot before the put */
+	for (slot = cl->group.default_groups; *slot; slot++) {
+		struct config_item *child = &(*slot)->cg_item;
+
+		*slot = NULL;
+		config_item_put(child);
+	}
+
+	space_list = NULL;
+	comm_list = NULL;
+
+	config_item_put(i);
+}
+
+/* configfs release callback: free the default-group array, then the cluster. */
+static void release_cluster(struct config_item *i)
+{
+	struct cluster *cluster = to_cluster(i);
+
+	kfree(cluster->group.default_groups);
+	kfree(cluster);
+}
+
+/*
+ * configfs make_group callback for "spaces": create a lockspace group
+ * with one default sub-group, "nodes", and an empty member list.
+ *
+ * Returns the new group, or NULL if any allocation failed.
+ */
+static struct config_group *make_space(struct config_group *g, const char *name)
+{
+	struct space *space;
+	struct config_group **defaults;
+	struct nodes *nodes_dir;
+
+	space = kzalloc(sizeof(struct space), GFP_KERNEL);
+	defaults = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
+	nodes_dir = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+
+	if (!(space && defaults && nodes_dir)) {
+		/* kfree(NULL) is a no-op, so free whatever was obtained */
+		kfree(space);
+		kfree(defaults);
+		kfree(nodes_dir);
+		return NULL;
+	}
+
+	config_group_init_type_name(&space->group, name, &space_type);
+	config_group_init_type_name(&nodes_dir->ns_group, "nodes",
+				    &nodes_type);
+
+	/* NULL-terminated array of default groups */
+	defaults[0] = &nodes_dir->ns_group;
+	defaults[1] = NULL;
+	space->group.default_groups = defaults;
+
+	INIT_LIST_HEAD(&space->members);
+	mutex_init(&space->members_lock);
+	space->members_count = 0;
+
+	return &space->group;
+}
+
+/*
+ * configfs drop_item callback for a lockspace: release each default
+ * sub-group and then the space item itself.
+ */
+static void drop_space(struct config_group *g, struct config_item *i)
+{
+	struct space *sp = to_space(i);
+	struct config_group **slot;
+
+	/* assert list_empty(&sp->members) */
+
+	/* walk the NULL-terminated array, clearing each slot before the put */
+	for (slot = sp->group.default_groups; *slot; slot++) {
+		struct config_item *child = &(*slot)->cg_item;
+
+		*slot = NULL;
+		config_item_put(child);
+	}
+
+	config_item_put(i);
+}
+
+/* configfs release callback: free the default-group array, then the space. */
+static void release_space(struct config_item *i)
+{
+	struct space *space = to_space(i);
+
+	kfree(space->group.default_groups);
+	kfree(space);
+}
+
+/*
+ * configfs make_item callback for "comms": allocate a comm that has no
+ * nodeid yet (-1), is not local and has no addresses.
+ *
+ * Returns the new item, or NULL if the allocation failed.
+ */
+static struct config_item *make_comm(struct config_group *g, const char *name)
+{
+	struct comm *comm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+
+	if (comm == NULL)
+		return NULL;
+
+	config_item_init_type_name(&comm->item, name, &comm_type);
+
+	comm->nodeid = -1;
+	comm->local = 0;
+	comm->addr_count = 0;
+
+	return &comm->item;
+}
+
+/*
+ * configfs drop_item callback for a comm: forget it as the local comm if
+ * it was, close its lowcomms connection, free the stored addresses and
+ * drop the item.
+ */
+static void drop_comm(struct config_group *g, struct config_item *i)
+{
+	struct comm *comm = to_comm(i);
+
+	if (comm == local_comm)
+		local_comm = NULL;
+
+	dlm_lowcomms_close(comm->nodeid);
+
+	/* free from the last stored address down to addr[0] */
+	while (comm->addr_count--)
+		kfree(comm->addr[comm->addr_count]);
+
+	config_item_put(i);
+}
+
+/* configfs release callback: free the comm that embeds this item. */
+static void release_comm(struct config_item *i)
+{
+	kfree(to_comm(i));
+}
+
+/*
+ * configfs make_item callback for "nodes": allocate a node with no nodeid
+ * yet (-1) and weight 1, and link it on the member list of the space that
+ * is the parent of this "nodes" group.
+ *
+ * Returns the new item, or NULL if the allocation failed.
+ */
+static struct config_item *make_node(struct config_group *g, const char *name)
+{
+	struct space *space = to_space(g->cg_item.ci_parent);
+	struct node *node = kzalloc(sizeof(struct node), GFP_KERNEL);
+
+	if (node == NULL)
+		return NULL;
+
+	config_item_init_type_name(&node->item, name, &node_type);
+	node->nodeid = -1;
+	node->weight = 1;  /* default weight of 1 if none is set */
+
+	mutex_lock(&space->members_lock);
+	list_add(&node->list, &space->members);
+	space->members_count++;
+	mutex_unlock(&space->members_lock);
+
+	return &node->item;
+}
+
+/*
+ * configfs drop_item callback for a node: unlink it from the parent
+ * space's member list (under members_lock) and drop the item.
+ */
+static void drop_node(struct config_group *g, struct config_item *i)
+{
+	struct node *node = to_node(i);
+	struct space *space = to_space(g->cg_item.ci_parent);
+
+	mutex_lock(&space->members_lock);
+	list_del(&node->list);
+	space->members_count--;
+	mutex_unlock(&space->members_lock);
+
+	config_item_put(i);
+}
+
+/* configfs release callback: free the node that embeds this item. */
+static void release_node(struct config_item *i)
+{
+	kfree(to_node(i));
+}
+
+static struct clusters clusters_root = {
+ .subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "dlm",
+ .ci_type = &clusters_type,
+ },
+ },
+ },
+};
+
+/*
+ * Initialise the "dlm" configfs subsystem and register it.
+ * Returns the result of configfs_register_subsystem().
+ */
+int dlm_config_init(void)
+{
+	struct configfs_subsystem *subsys = &clusters_root.subsys;
+
+	config_group_init(&subsys->su_group);
+	init_MUTEX(&subsys->su_sem);
+
+	return configfs_register_subsystem(subsys);
+}
+
+void dlm_config_exit(void) /* unregister the subsystem registered by dlm_config_init() */
+{
+ configfs_unregister_subsystem(&clusters_root.subsys);
+}
+
+/*
+ * Functions for user space to read/write attributes
+ */
+
+/*
+ * configfs show_attribute callback for comm items: dispatch to the
+ * attribute's show handler.  Attributes without one (e.g. "addr") read
+ * as empty, i.e. 0 bytes.
+ */
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+			 char *buf)
+{
+	struct comm_attribute *cma;
+
+	cma = container_of(a, struct comm_attribute, attr);
+	if (!cma->show)
+		return 0;
+
+	return cma->show(to_comm(i), buf);
+}
+
+/*
+ * configfs store_attribute callback for comm items: dispatch to the
+ * attribute's store handler, or fail with -EINVAL if it has none.
+ */
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len)
+{
+	struct comm_attribute *cma;
+
+	cma = container_of(a, struct comm_attribute, attr);
+	if (!cma->store)
+		return -EINVAL;
+
+	return cma->store(to_comm(i), buf, len);
+}
+
+/* Format the comm's nodeid as decimal plus newline; returns bytes written. */
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+{
+	int written = sprintf(buf, "%d\n", cm->nodeid);
+
+	return written;
+}
+
+/*
+ * Parse buf with simple_strtol() (base auto-detected) and store the
+ * result as the comm's nodeid.  Always reports the whole input consumed.
+ */
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+{
+	long value = simple_strtol(buf, NULL, 0);
+
+	cm->nodeid = value;
+	return len;
+}
+
+/* Format the comm's local flag as decimal plus newline; returns bytes written. */
+static ssize_t comm_local_read(struct comm *cm, char *buf)
+{
+	int written = sprintf(buf, "%d\n", cm->local);
+
+	return written;
+}
+
+/*
+ * Parse buf with simple_strtol() and store the result as the comm's
+ * local flag.  The first comm written with a nonzero flag while no local
+ * comm is recorded becomes local_comm.  Always reports the whole input
+ * consumed.
+ */
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+{
+	long value = simple_strtol(buf, NULL, 0);
+
+	cm->local = value;
+	if (!cm->local)
+		return len;
+
+	if (!local_comm)
+		local_comm = cm;
+	return len;
+}
+
+/*
+ * Append one address to a comm.  The write must be exactly one raw
+ * struct sockaddr_storage; a private copy is stored in the next free
+ * slot of cm->addr[].
+ *
+ * Returns len on success, -EINVAL for a wrong-sized write, -ENOSPC when
+ * DLM_MAX_ADDR_COUNT addresses are already stored, -ENOMEM on allocation
+ * failure.
+ */
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+{
+	struct sockaddr_storage *copy;
+
+	if (len != sizeof(struct sockaddr_storage))
+		return -EINVAL;
+
+	if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
+		return -ENOSPC;
+
+	copy = kzalloc(sizeof(*copy), GFP_KERNEL);
+	if (copy == NULL)
+		return -ENOMEM;
+
+	memcpy(copy, buf, len);
+	cm->addr[cm->addr_count] = copy;
+	cm->addr_count++;
+
+	return len;
+}
+
+/*
+ * configfs show_attribute callback for node items: dispatch to the
+ * attribute's show handler; an attribute without one reads as 0 bytes.
+ */
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+			 char *buf)
+{
+	struct node_attribute *nda;
+
+	nda = container_of(a, struct node_attribute, attr);
+	if (!nda->show)
+		return 0;
+
+	return nda->show(to_node(i), buf);
+}
+
+/*
+ * configfs store_attribute callback for node items: dispatch to the
+ * attribute's store handler, or fail with -EINVAL if it has none.
+ */
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len)
+{
+	struct node_attribute *nda;
+
+	nda = container_of(a, struct node_attribute, attr);
+	if (!nda->store)
+		return -EINVAL;
+
+	return nda->store(to_node(i), buf, len);
+}
+
+/* Format the node's nodeid as decimal plus newline; returns bytes written. */
+static ssize_t node_nodeid_read(struct node *nd, char *buf)
+{
+	int written = sprintf(buf, "%d\n", nd->nodeid);
+
+	return written;
+}
+
+/*
+ * Parse buf with simple_strtol() (base auto-detected) and store the
+ * result as the node's nodeid.  Always reports the whole input consumed.
+ */
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+{
+	long value = simple_strtol(buf, NULL, 0);
+
+	nd->nodeid = value;
+	return len;
+}
+
+/* Format the node's weight as decimal plus newline; returns bytes written. */
+static ssize_t node_weight_read(struct node *nd, char *buf)
+{
+	int written = sprintf(buf, "%d\n", nd->weight);
+
+	return written;
+}
+
+/*
+ * Parse buf with simple_strtol() (base auto-detected) and store the
+ * result as the node's weight.  Always reports the whole input consumed.
+ */
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+{
+	long value = simple_strtol(buf, NULL, 0);
+
+	nd->weight = value;
+	return len;
+}
+
+/*
+ * Functions for the dlm to get the info that's been configured
+ */
+
+/*
+ * Look up a lockspace by name under the "spaces" group.  Returns NULL
+ * when no cluster has been created (space_list unset) or the name is not
+ * found.  Callers in this file release the result with put_space().
+ */
+static struct space *get_space(char *name)
+{
+	struct config_item *item;
+
+	if (space_list == NULL)
+		return NULL;
+
+	item = config_group_find_obj(space_list, name);
+	return to_space(item);
+}
+
+static void put_space(struct space *sp) /* drop one reference on the space's config item */
+{
+ config_item_put(&sp->group.cg_item);
+}
+
+/*
+ * Find a comm and return it with a reference held.
+ *
+ * @nodeid: when nonzero, match on the comm's nodeid and ignore @addr
+ * @addr:   when @nodeid is zero, match on the comm's first stored address
+ *
+ * Returns NULL when no cluster has been created (comm_list unset), when
+ * neither a nodeid nor an address was given, or when nothing matches.
+ * The caller releases the result with put_comm().
+ */
+static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+{
+	struct config_item *i;
+	struct comm *cm, *found = NULL;
+
+	if (!comm_list)
+		return NULL;
+
+	/* an address lookup needs an address; memcmp() on NULL would oops */
+	if (!nodeid && !addr)
+		return NULL;
+
+	down(&clusters_root.subsys.su_sem);
+
+	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
+		cm = to_comm(i);
+
+		if (nodeid) {
+			if (cm->nodeid != nodeid)
+				continue;
+		} else {
+			if (!cm->addr_count ||
+			    memcmp(cm->addr[0], addr, sizeof(*addr)))
+				continue;
+		}
+
+		/*
+		 * Take the reference before su_sem is released, so the
+		 * match is pinned for as long as the list walk protects
+		 * it and is never used unreferenced after unlocking.
+		 */
+		config_item_get(i);
+		found = cm;
+		break;
+	}
+	up(&clusters_root.subsys.su_sem);
+
+	return found;
+}
+
+static void put_comm(struct comm *cm) /* drop the item reference taken by get_comm() */
+{
+ config_item_put(&cm->item);
+}
+
+/*
+ * Build an array of the nodeids configured for lockspace lsname.
+ *
+ * Returns the number of ids, with a kcalloc()ed array stored in *ids_out
+ * that the caller must free.  Returns 0 without touching *ids_out when
+ * the space has no members, -EEXIST when the space is not found and
+ * -ENOMEM when the array cannot be allocated.
+ */
+int dlm_nodeid_list(char *lsname, int **ids_out)
+{
+	struct space *sp;
+	struct node *nd;
+	int *ids;
+	int count, filled = 0, rv = 0;
+
+	sp = get_space(lsname);
+	if (sp == NULL)
+		return -EEXIST;
+
+	mutex_lock(&sp->members_lock);
+	count = sp->members_count;
+	if (count) {
+		ids = kcalloc(count, sizeof(int), GFP_KERNEL);
+		if (ids == NULL) {
+			rv = -ENOMEM;
+		} else {
+			list_for_each_entry(nd, &sp->members, list)
+				ids[filled++] = nd->nodeid;
+
+			if (count != filled)
+				printk("bad nodeid count %d %d\n", count, filled);
+
+			*ids_out = ids;
+			rv = count;
+		}
+	}
+	mutex_unlock(&sp->members_lock);
+	put_space(sp);
+
+	return rv;
+}
+
+/*
+ * Return the configured weight of node nodeid in lockspace lsname, or
+ * -EEXIST when either the space or the node is not found.
+ */
+int dlm_node_weight(char *lsname, int nodeid)
+{
+	struct space *sp = get_space(lsname);
+	struct node *nd;
+	int weight = -EEXIST;
+
+	if (sp == NULL)
+		return weight;
+
+	mutex_lock(&sp->members_lock);
+	list_for_each_entry(nd, &sp->members, list) {
+		if (nd->nodeid == nodeid) {
+			weight = nd->weight;
+			break;
+		}
+	}
+	mutex_unlock(&sp->members_lock);
+
+	put_space(sp);
+	return weight;
+}
+
+/*
+ * Copy the first configured address of the comm with the given nodeid
+ * into *addr.
+ *
+ * Returns 0 on success, -EEXIST when no comm has that nodeid and -ENOENT
+ * when the comm exists but has no address stored yet.
+ */
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
+{
+	struct comm *cm = get_comm(nodeid, NULL);
+	int error = 0;
+
+	if (!cm)
+		return -EEXIST;
+
+	if (cm->addr_count)
+		memcpy(addr, cm->addr[0], sizeof(*addr));
+	else
+		error = -ENOENT;
+
+	/* drop the get_comm() reference on every path, including -ENOENT */
+	put_comm(cm);
+	return error;
+}
+
+/*
+ * Look up the comm whose first stored address equals *addr and report
+ * its nodeid through *nodeid.
+ *
+ * Returns 0 on success or -EEXIST when no comm matches.
+ */
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
+{
+	struct comm *match;
+
+	match = get_comm(0, addr);
+	if (match == NULL)
+		return -EEXIST;
+
+	*nodeid = match->nodeid;
+	put_comm(match);
+
+	return 0;
+}
+
+/* Return the nodeid of the local comm, or 0 when none has been marked local. */
+int dlm_our_nodeid(void)
+{
+	if (!local_comm)
+		return 0;
+
+	return local_comm->nodeid;
+}
+
+/*
+ * Copy one of the local comm's stored addresses into *addr.
+ * num 0 is the first address, num 1 is the second, and so on.
+ *
+ * Returns 0 on success, or -1 when no local comm is recorded or num is
+ * not the index of a stored address.
+ */
+int dlm_our_addr(struct sockaddr_storage *addr, int num)
+{
+	if (!local_comm)
+		return -1;
+	/* a negative num would index before addr[0]; comparing num directly
+	   also avoids overflowing num + 1 */
+	if (num < 0 || num >= local_comm->addr_count)
+		return -1;
+	memcpy(addr, local_comm->addr[num], sizeof(*addr));
+	return 0;
+}
+
+/* Config file defaults */
+#define DEFAULT_TCP_PORT 21064
+#define DEFAULT_BUFFER_SIZE 4096
+#define DEFAULT_RSBTBL_SIZE 256
+#define DEFAULT_LKBTBL_SIZE 1024
+#define DEFAULT_DIRTBL_SIZE 512
+#define DEFAULT_RECOVER_TIMER 5
+#define DEFAULT_TOSS_SECS 10
+#define DEFAULT_SCAN_SECS 5
+
+struct dlm_config_info dlm_config = {
+ .tcp_port = DEFAULT_TCP_PORT,
+ .buffer_size = DEFAULT_BUFFER_SIZE,
+ .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+ .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
+ .dirtbl_size = DEFAULT_DIRTBL_SIZE,
+ .recover_timer = DEFAULT_RECOVER_TIMER,
+ .toss_secs = DEFAULT_TOSS_SECS,
+ .scan_secs = DEFAULT_SCAN_SECS
+};
+
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+
+#define DLM_MAX_ADDR_COUNT 3
+
+struct dlm_config_info { /* dlm settings; the one instance is dlm_config */
+ int tcp_port;
+ int buffer_size;
+ int rsbtbl_size; /* presumably the size of ls_rsbtbl - TODO confirm */
+ int lkbtbl_size; /* presumably the size of ls_lkbtbl - TODO confirm */
+ int dirtbl_size; /* presumably the size of ls_dirtbl - TODO confirm */
+ int recover_timer;
+ int toss_secs;
+ int scan_secs;
+};
+
+extern struct dlm_config_info dlm_config; /* defined in config.c */
+
+int dlm_config_init(void);
+void dlm_config_exit(void);
+/* weight of nodeid within lockspace lsname, or -EEXIST if not found */
+int dlm_node_weight(char *lsname, int nodeid);
+int dlm_nodeid_list(char *lsname, int **ids_out);
+/* copy the first address of nodeid into *addr; 0, -EEXIST or -ENOENT */
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
+/* store the nodeid of the comm matching addr in *nodeid; 0 or -EEXIST */
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
+/* nodeid of the local comm, or 0 when none is set */
+int dlm_our_nodeid(void);
+/* copy local address number num (0 is the first) into *addr; 0 or -1 */
+int dlm_our_addr(struct sockaddr_storage *addr, int num);
+
+#endif /* __CONFIG_DOT_H__ */
+
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..ca94a837a5bb
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,387 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+
+#include "dlm_internal.h"
+
+#define DLM_DEBUG_BUF_LEN 4096
+static char debug_buf[DLM_DEBUG_BUF_LEN]; /* text staged by waiters_read() */
+static struct mutex debug_buf_lock; /* held while debug_buf is in use */
+
+static struct dentry *dlm_root; /* the "dlm" debugfs directory */
+
+struct rsb_iter { /* cursor over all rsbs in a lockspace's rsb hash table */
+ int entry; /* index of the ls_rsbtbl bucket being walked */
+ struct dlm_ls *ls; /* lockspace being walked */
+ struct list_head *next; /* current position in the bucket list */
+ struct dlm_rsb *rsb; /* rsb that contains 'next' as res_hashchain */
+};
+
+/*
+ * dump all rsb's in the lockspace hash table
+ */
+
+/*
+ * Map a DLM_LOCK_* mode to its two-character label.  DLM_LOCK_IV is shown
+ * as "--" and any value that is not a known mode as "??".
+ */
+static char *print_lockmode(int mode)
+{
+	char *label = "??";
+
+	switch (mode) {
+	case DLM_LOCK_IV:
+		label = "--";
+		break;
+	case DLM_LOCK_NL:
+		label = "NL";
+		break;
+	case DLM_LOCK_CR:
+		label = "CR";
+		break;
+	case DLM_LOCK_CW:
+		label = "CW";
+		break;
+	case DLM_LOCK_PR:
+		label = "PR";
+		break;
+	case DLM_LOCK_PW:
+		label = "PW";
+		break;
+	case DLM_LOCK_EX:
+		label = "EX";
+		break;
+	default:
+		break;
+	}
+
+	return label;
+}
+
+/*
+ * Print one line describing lkb: its id and granted mode, the requested
+ * mode when it is converting or waiting, the remote node/lock id when
+ * lkb_nodeid is set, and the wait type when one is pending.
+ */
+static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
+ struct dlm_rsb *res)
+{
+ seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
+
+ if (lkb->lkb_status == DLM_LKSTS_CONVERT
+ || lkb->lkb_status == DLM_LKSTS_WAITING)
+ seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
+
+ if (lkb->lkb_nodeid) {
+ /* "Master" when the lkb's node is the rsb's node, else "Remote" */
+ if (lkb->lkb_nodeid != res->res_nodeid)
+ seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
+ lkb->lkb_remid);
+ else
+ seq_printf(s, " Master: %08x", lkb->lkb_remid);
+ }
+
+ if (lkb->lkb_wait_type)
+ seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+
+ seq_printf(s, "\n");
+}
+
+/*
+ * Write a multi-line description of one rsb to the seq_file: its name,
+ * its master state, the LVB when one is attached, its recovery list
+ * membership, and every lkb on the grant, convert, wait and lookup queues.
+ * Always returns 0.
+ */
+static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+{
+ struct dlm_lkb *lkb;
+ int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
+
+ seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+ /* the name is res_length raw bytes; show non-printable ones as '.' */
+ for (i = 0; i < res->res_length; i++) {
+ if (isprint(res->res_name[i]))
+ seq_printf(s, "%c", res->res_name[i]);
+ else
+ seq_printf(s, "%c", '.');
+ }
+ /* res_nodeid: > 0 master is that node, 0 we are master, -1 lookup pending */
+ if (res->res_nodeid > 0)
+ seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
+ res->res_nodeid);
+ else if (res->res_nodeid == 0)
+ seq_printf(s, "\" \nMaster Copy\n");
+ else if (res->res_nodeid == -1)
+ seq_printf(s, "\" \nLooking up master (lkid %x)\n",
+ res->res_first_lkid);
+ else
+ seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
+
+ /* Print the LVB: */
+ if (res->res_lvbptr) {
+ seq_printf(s, "LVB: ");
+ for (i = 0; i < lvblen; i++) {
+ /* break the hex dump into two rows at the midpoint */
+ if (i == lvblen / 2)
+ seq_printf(s, "\n ");
+ seq_printf(s, "%02x ",
+ (unsigned char) res->res_lvbptr[i]);
+ }
+ if (rsb_flag(res, RSB_VALNOTVALID))
+ seq_printf(s, " (INVALID)");
+ seq_printf(s, "\n");
+ }
+
+ /* non-zero when the rsb is currently linked on the respective list */
+ root_list = !list_empty(&res->res_root_list);
+ recover_list = !list_empty(&res->res_recover_list);
+
+ if (root_list || recover_list) {
+ seq_printf(s, "Recovery: root %d recover %d flags %lx "
+ "count %d\n", root_list, recover_list,
+ res->res_flags, res->res_recover_locks_count);
+ }
+
+ /* Print the locks attached to this resource */
+ seq_printf(s, "Granted Queue\n");
+ list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
+ print_lock(s, lkb, res);
+
+ seq_printf(s, "Conversion Queue\n");
+ list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
+ print_lock(s, lkb, res);
+
+ seq_printf(s, "Waiting Queue\n");
+ list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
+ print_lock(s, lkb, res);
+
+ /* the lookup queue heading is only printed when the queue has entries */
+ if (list_empty(&res->res_lookup))
+ goto out;
+
+ seq_printf(s, "Lookup Queue\n");
+ list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
+ seq_printf(s, "%08x %s", lkb->lkb_id,
+ print_lockmode(lkb->lkb_rqmode));
+ if (lkb->lkb_wait_type)
+ seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+ seq_printf(s, "\n");
+ }
+ out:
+ return 0;
+}
+
+/*
+ * Advance the iterator to the next rsb in the lockspace hash table.
+ *
+ * When ri->next is NULL the scan starts at bucket ri->entry and stops at
+ * the first non-empty bucket.  Otherwise the iterator steps to the next
+ * element of the current bucket, moving on to the following buckets when
+ * the end of that list is reached.
+ *
+ * Returns 0 with ri->rsb set, or 1 once every bucket has been visited.
+ * A bucket's read lock is held only while its list pointers are examined
+ * and is always released before returning.
+ */
+static int rsb_iter_next(struct rsb_iter *ri)
+{
+ struct dlm_ls *ls = ri->ls;
+ int i;
+
+ if (!ri->next) {
+ top:
+ /* Find the next non-empty hash bucket */
+ for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
+ read_lock(&ls->ls_rsbtbl[i].lock);
+ if (!list_empty(&ls->ls_rsbtbl[i].list)) {
+ ri->next = ls->ls_rsbtbl[i].list.next;
+ read_unlock(&ls->ls_rsbtbl[i].lock);
+ break;
+ }
+ read_unlock(&ls->ls_rsbtbl[i].lock);
+ }
+ ri->entry = i;
+
+ /* ran off the end of the table: iteration is complete */
+ if (ri->entry >= ls->ls_rsbtbl_size)
+ return 1;
+ } else {
+ i = ri->entry;
+ read_lock(&ls->ls_rsbtbl[i].lock);
+ ri->next = ri->next->next;
+ /* in a well-formed circular list only the head's successor is
+ list.next, so this is true when ri->next is now the list head */
+ if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
+ /* End of list - move to next bucket */
+ ri->next = NULL;
+ ri->entry++;
+ read_unlock(&ls->ls_rsbtbl[i].lock);
+ goto top;
+ }
+ read_unlock(&ls->ls_rsbtbl[i].lock);
+ }
+ /* NOTE(review): the bucket lock is no longer held here - confirm that
+ the rsb cannot be unlinked between calls */
+ ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
+
+ return 0;
+}
+
+/* Release an iterator allocated by rsb_iter_init(). */
+static void rsb_iter_free(struct rsb_iter *ri)
+{
+ kfree(ri);
+}
+
+/*
+ * Allocate an iterator for @ls and position it on the first rsb.
+ * Returns NULL when the allocation fails or the hash table holds no rsb.
+ */
+static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
+{
+	struct rsb_iter *iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+
+	if (iter) {
+		iter->ls = ls;
+		iter->entry = 0;
+		iter->next = NULL;
+
+		/* a non-zero result means there is nothing to iterate over */
+		if (rsb_iter_next(iter)) {
+			rsb_iter_free(iter);
+			iter = NULL;
+		}
+	}
+
+	return iter;
+}
+
+/*
+ * seq_file start op: build a fresh iterator and step it forward *pos
+ * times.  Returns the iterator, or NULL when the table has fewer rsbs than
+ * that (the iterator is freed in that case).
+ */
+static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct rsb_iter *iter = rsb_iter_init(file->private);
+	loff_t skip;
+
+	if (!iter)
+		return NULL;
+
+	for (skip = *pos; skip != 0; skip--) {
+		if (!rsb_iter_next(iter))
+			continue;
+		rsb_iter_free(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+/*
+ * seq_file next op: bump *pos and advance the iterator.  When no rsb is
+ * left the iterator is freed and NULL is returned.
+ */
+static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
+{
+	struct rsb_iter *iter = iter_ptr;
+
+	++*pos;
+
+	if (!rsb_iter_next(iter))
+		return iter;
+
+	rsb_iter_free(iter);
+	return NULL;
+}
+
+/*
+ * seq_file stop op.  seq_file passes in whatever start/next returned last.
+ * That is NULL once rsb_seq_start() or rsb_seq_next() has run out of rsbs
+ * (they free the iterator themselves), but it is a live iterator when the
+ * walk is interrupted early, e.g. because the output buffer filled up.
+ * The following start op allocates a new iterator, so the old one must be
+ * released here or it is leaked.
+ */
+static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	struct rsb_iter *ri = iter_ptr;
+
+	if (ri)
+		rsb_iter_free(ri);
+}
+
+/* seq_file show op: print the rsb the iterator currently points at. */
+static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	const struct rsb_iter *cursor = iter_ptr;
+
+	(void) print_resource(cursor->rsb, file);
+	return 0;
+}
+
+static struct seq_operations rsb_seq_ops = { /* installed by rsb_open() */
+ .start = rsb_seq_start,
+ .next = rsb_seq_next,
+ .stop = rsb_seq_stop,
+ .show = rsb_seq_show,
+};
+
+/*
+ * Open handler for the per-lockspace rsb file: start a seq_file session
+ * and hand it inode->i_private, which rsb_seq_start() passes on to
+ * rsb_iter_init() as the lockspace.
+ */
+static int rsb_open(struct inode *inode, struct file *file)
+{
+	int err = seq_open(file, &rsb_seq_ops);
+
+	if (!err) {
+		struct seq_file *sf = file->private_data;
+
+		sf->private = inode->i_private;
+	}
+
+	return err;
+}
+
+static struct file_operations rsb_fops = { /* ops of the rsb dump file */
+ .owner = THIS_MODULE,
+ .open = rsb_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+/*
+ * dump lkb's on the ls_waiters list
+ */
+
+/* Stash inode->i_private in the file; waiters_read() uses it as the ls. */
+static int waiters_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return 0;
+}
+
+/*
+ * Format one line per lkb on ls_waiters (id, wait type, nodeid, resource
+ * name) into debug_buf and copy the requested window to user space.
+ * Output is cut off at the last line that fits in DLM_DEBUG_BUF_LEN.
+ * The whole list is reformatted on every call.
+ */
+static ssize_t waiters_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct dlm_ls *ls = file->private_data;
+ struct dlm_lkb *lkb;
+ /* NOTE(review): ret and rv are size_t although snprintf() returns int
+ and this function returns ssize_t - confirm error values survive */
+ size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv;
+
+ mutex_lock(&debug_buf_lock);
+ mutex_lock(&ls->ls_waiters_mutex);
+ memset(debug_buf, 0, sizeof(debug_buf));
+
+ list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+ /* NOTE(review): %s assumes res_name is NUL-terminated - confirm */
+ ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n",
+ lkb->lkb_id, lkb->lkb_wait_type,
+ lkb->lkb_nodeid, lkb->lkb_resource->res_name);
+ /* the line did not fit: stop without counting the partial line */
+ if (ret >= len - pos)
+ break;
+ pos += ret;
+ }
+ mutex_unlock(&ls->ls_waiters_mutex);
+
+ /* debug_buf_lock stays held until the copy to user space is done */
+ rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos);
+ mutex_unlock(&debug_buf_lock);
+ return rv;
+}
+
+static struct file_operations waiters_fops = { /* ops of "<ls>_waiters" */
+ .owner = THIS_MODULE,
+ .open = waiters_open,
+ .read = waiters_read
+};
+
+/*
+ * Create the two debugfs files of a lockspace under the "dlm" directory:
+ * "<ls_name>" (rsb dump) and "<ls_name>_waiters" (waiters list).
+ *
+ * Returns 0 on success or -ENOMEM when either file cannot be created.  On
+ * failure no file is left behind and ls_debug_rsb_dentry is NULL, so a
+ * later dlm_delete_debug_file() cannot remove the rsb file a second time.
+ */
+int dlm_create_debug_file(struct dlm_ls *ls)
+{
+	/*
+	 * Room for the lockspace name, the "_waiters" suffix and the
+	 * terminating NUL (sizeof of the literal counts the NUL).  The old
+	 * DLM_LOCKSPACE_LEN+8 buffer dropped the last character of the
+	 * suffix for a name of DLM_LOCKSPACE_LEN characters.
+	 */
+	char name[DLM_LOCKSPACE_LEN + sizeof("_waiters")];
+
+	ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
+						      S_IFREG | S_IRUGO,
+						      dlm_root,
+						      ls,
+						      &rsb_fops);
+	if (!ls->ls_debug_rsb_dentry)
+		return -ENOMEM;
+
+	snprintf(name, sizeof(name), "%s_waiters", ls->ls_name);
+
+	ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+							  S_IFREG | S_IRUGO,
+							  dlm_root,
+							  ls,
+							  &waiters_fops);
+	if (!ls->ls_debug_waiters_dentry) {
+		debugfs_remove(ls->ls_debug_rsb_dentry);
+		/* do not leave a dangling pointer to the removed dentry */
+		ls->ls_debug_rsb_dentry = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Remove whichever of the lockspace's two debugfs files were created. */
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+ if (ls->ls_debug_rsb_dentry)
+ debugfs_remove(ls->ls_debug_rsb_dentry);
+ if (ls->ls_debug_waiters_dentry)
+ debugfs_remove(ls->ls_debug_waiters_dentry);
+}
+
+/*
+ * Initialise debug_buf_lock and create the top-level "dlm" debugfs
+ * directory.  Returns 0, or -ENOMEM when the directory cannot be created.
+ */
+int dlm_register_debugfs(void)
+{
+	mutex_init(&debug_buf_lock);
+
+	dlm_root = debugfs_create_dir("dlm", NULL);
+	if (!dlm_root)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Remove the "dlm" debugfs directory created by dlm_register_debugfs(). */
+void dlm_unregister_debugfs(void)
+{
+ debugfs_remove(dlm_root);
+}
+
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "rcom.h"
+#include "config.h"
+#include "memory.h"
+#include "recover.h"
+#include "util.h"
+#include "lock.h"
+#include "dir.h"
+
+
+/*
+ * Park a directory entry on ls_recover_list, from which get_free_de() can
+ * reuse it and dlm_clear_free_entries() eventually frees it.
+ */
+static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+ spin_lock(&ls->ls_recover_list_lock);
+ list_add(&de->list, &ls->ls_recover_list);
+ spin_unlock(&ls->ls_recover_list_lock);
+}
+
+/*
+ * Obtain a directory entry able to hold a name of @len bytes.
+ *
+ * A parked entry on ls_recover_list whose length equals @len is preferred:
+ * it is unlinked and its master_nodeid and name are zeroed.  When no such
+ * entry exists a new one is requested from allocate_direntry(), whose
+ * result (possibly NULL) is returned as is.
+ */
+static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
+{
+	struct dlm_direntry *pos, *reuse = NULL;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	list_for_each_entry(pos, &ls->ls_recover_list, list) {
+		if (pos->length != len)
+			continue;
+
+		list_del(&pos->list);
+		pos->master_nodeid = 0;
+		memset(pos->name, 0, len);
+		reuse = pos;
+		break;
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+
+	if (reuse)
+		return reuse;
+
+	return allocate_direntry(ls, len);
+}
+
+/* Unlink and free every directory entry parked on ls_recover_list. */
+void dlm_clear_free_entries(struct dlm_ls *ls)
+{
+	struct dlm_direntry *de, *following;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	list_for_each_entry_safe(de, following, &ls->ls_recover_list, list) {
+		list_del(&de->list);
+		free_direntry(de);
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+/*
+ * We use the upper 16 bits of the hash value to select the directory node.
+ * Low bits are used for distribution of rsb's among hash buckets on each node.
+ *
+ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
+ * num_nodes to the hash value. This value in the desired range is used as an
+ * offset into the sorted list of nodeid's to give the particular nodeid.
+ */
+
+/*
+ * Map a resource hash to the nodeid of its directory node.
+ *
+ * A single-node lockspace always answers with our own nodeid.  Otherwise
+ * the upper 16 bits of the hash index ls_node_array (modulo
+ * ls_total_weight) when that array exists, or select the n-th member of
+ * ls_nodes (modulo ls_num_nodes) when it does not.
+ */
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
+{
+	struct dlm_member *memb = NULL;
+	struct list_head *pos;
+	uint32_t want, seen = 0;
+
+	if (ls->ls_num_nodes == 1)
+		return dlm_our_nodeid();
+
+	if (ls->ls_node_array)
+		return ls->ls_node_array[(hash >> 16) % ls->ls_total_weight];
+
+	/* make_member_array() failed to kmalloc ls_node_array... */
+
+	want = (hash >> 16) % ls->ls_num_nodes;
+
+	list_for_each(pos, &ls->ls_nodes) {
+		if (seen++ == want) {
+			memb = list_entry(pos, struct dlm_member, list);
+			break;
+		}
+	}
+
+	DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
+				 ls->ls_num_nodes, seen, want););
+	return memb->nodeid;
+}
+
+/* Directory nodeid of an rsb, derived from its lockspace and res_hash. */
+int dlm_dir_nodeid(struct dlm_rsb *r)
+{
+ return dlm_hash2nodeid(r->res_ls, r->res_hash);
+}
+
+/*
+ * Directory hash bucket for a name: the jhash of its @len bytes, masked
+ * with ls_dirtbl_size - 1.
+ */
+static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
+{
+	return jhash(name, len, 0) & (ls->ls_dirtbl_size - 1);
+}
+
+/*
+ * Append @de to the directory bucket selected by its name.  No bucket lock
+ * is taken here.
+ */
+static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+	struct dlm_dirtable *slot;
+
+	slot = &ls->ls_dirtbl[dir_hash(ls, de->name, de->length)];
+	list_add_tail(&de->list, &slot->list);
+}
+
+/*
+ * Scan one directory bucket for an entry whose name has exactly @namelen
+ * bytes equal to @name.  Returns the entry or NULL.  The function takes no
+ * lock itself.
+ */
+static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
+					  int namelen, uint32_t bucket)
+{
+	struct dlm_direntry *de;
+
+	list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
+		if (de->length != namelen)
+			continue;
+		if (memcmp(name, de->name, namelen) == 0)
+			return de;
+	}
+
+	return NULL;
+}
+
+/*
+ * Delete the directory entry for @name on behalf of node @nodeid.
+ *
+ * The entry is removed only if it exists and records @nodeid as master;
+ * otherwise an error is logged and the directory is left unchanged.
+ */
+void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
+{
+	uint32_t bucket = dir_hash(ls, name, namelen);
+	struct dlm_direntry *de;
+
+	write_lock(&ls->ls_dirtbl[bucket].lock);
+
+	de = search_bucket(ls, name, namelen, bucket);
+	if (!de) {
+		log_error(ls, "remove fr %u none", nodeid);
+	} else if (de->master_nodeid != nodeid) {
+		log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
+	} else {
+		list_del(&de->list);
+		free_direntry(de);
+	}
+
+	write_unlock(&ls->ls_dirtbl[bucket].lock);
+}
+
+/*
+ * Empty every directory bucket, parking the entries on ls_recover_list via
+ * put_free_de() instead of freeing them.  ls_recover_list must be empty on
+ * entry.
+ */
+void dlm_dir_clear(struct dlm_ls *ls)
+{
+	struct dlm_direntry *de, *following;
+	struct dlm_dirtable *slot;
+	int i;
+
+	DLM_ASSERT(list_empty(&ls->ls_recover_list), );
+
+	for (i = 0; i < ls->ls_dirtbl_size; i++) {
+		slot = &ls->ls_dirtbl[i];
+
+		write_lock(&slot->lock);
+		list_for_each_entry_safe(de, following, &slot->list, list) {
+			list_del(&de->list);
+			put_free_de(ls, de);
+		}
+		write_unlock(&slot->lock);
+	}
+}
+
+/*
+ * Rebuild the resource directory of a lockspace.
+ *
+ * The current directory is cleared, then every member node is asked
+ * repeatedly (dlm_rcom_names) for a block of resource names.  Each name
+ * received is entered in the directory with that member as master.  The
+ * last name of a block is sent back with the next request so the remote
+ * side can continue from there.
+ *
+ * Returns 0 on success (also when the lockspace uses no directory), the
+ * error from dlm_recovery_stopped() or dlm_rcom_names(), or -ENOMEM.
+ * Entries still parked on ls_recover_list are freed on every exit path.
+ */
+int dlm_recover_directory(struct dlm_ls *ls)
+{
+ struct dlm_member *memb;
+ struct dlm_direntry *de;
+ char *b, *last_name = NULL;
+ int error = -ENOMEM, last_len, count = 0;
+ uint16_t namelen;
+
+ log_debug(ls, "dlm_recover_directory");
+
+ if (dlm_no_directory(ls))
+ goto out_status;
+
+ /* moves all existing entries to ls_recover_list for reuse below */
+ dlm_dir_clear(ls);
+
+ last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
+ if (!last_name)
+ goto out;
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ /* an empty last_name asks the node to start from the beginning */
+ memset(last_name, 0, DLM_RESNAME_MAXLEN);
+ last_len = 0;
+
+ for (;;) {
+ error = dlm_recovery_stopped(ls);
+ if (error)
+ goto out_free;
+
+ error = dlm_rcom_names(ls, memb->nodeid,
+ last_name, last_len);
+ if (error)
+ goto out_free;
+
+ schedule();
+
+ /*
+ * pick namelen/name pairs out of received buffer
+ */
+
+ /* the records follow the dlm_rcom header in ls_recover_buf */
+ b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
+
+ for (;;) {
+ /* memcpy because b need not be 2-byte aligned */
+ memcpy(&namelen, b, sizeof(uint16_t));
+ namelen = be16_to_cpu(namelen);
+ b += sizeof(uint16_t);
+
+ /* namelen of 0xFFFF marks end of names for
+ this node; namelen of 0 marks end of the
+ buffer */
+
+ if (namelen == 0xFFFF)
+ goto done;
+ if (!namelen)
+ break;
+
+ error = -ENOMEM;
+ de = get_free_de(ls, namelen);
+ if (!de)
+ goto out_free;
+
+ de->master_nodeid = memb->nodeid;
+ de->length = namelen;
+ /* remember the name so the next request can resume after it */
+ last_len = namelen;
+ memcpy(de->name, b, namelen);
+ memcpy(last_name, b, namelen);
+ b += namelen;
+
+ /* NOTE(review): inserted without taking the bucket lock -
+ confirm nothing else touches the directory at this point */
+ add_entry_to_hash(ls, de);
+ count++;
+ }
+ }
+ done:
+ ;
+ }
+
+ out_status:
+ error = 0;
+ dlm_set_recover_status(ls, DLM_RS_DIR);
+ log_debug(ls, "dlm_recover_directory %d entries", count);
+ out_free:
+ /* last_name is still NULL when arriving here via out_status early */
+ kfree(last_name);
+ out:
+ dlm_clear_free_entries(ls);
+ return error;
+}
+
+/*
+ * Look up name in the directory, adding it with nodeid as master when it
+ * is not present yet.  *r_nodeid receives the master recorded in the entry.
+ *
+ * Returns 0 on success, -EEXIST when an entry found by the first search
+ * already names nodeid as master, or -ENOMEM when a new entry cannot be
+ * allocated.
+ */
+static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
+ int namelen, int *r_nodeid)
+{
+ struct dlm_direntry *de, *tmp;
+ uint32_t bucket;
+
+ bucket = dir_hash(ls, name, namelen);
+
+ write_lock(&ls->ls_dirtbl[bucket].lock);
+ de = search_bucket(ls, name, namelen, bucket);
+ if (de) {
+ *r_nodeid = de->master_nodeid;
+ write_unlock(&ls->ls_dirtbl[bucket].lock);
+ if (*r_nodeid == nodeid)
+ return -EEXIST;
+ return 0;
+ }
+
+ /* the bucket lock is dropped around the allocation */
+ write_unlock(&ls->ls_dirtbl[bucket].lock);
+
+ de = allocate_direntry(ls, namelen);
+ if (!de)
+ return -ENOMEM;
+
+ de->master_nodeid = nodeid;
+ de->length = namelen;
+ memcpy(de->name, name, namelen);
+
+ write_lock(&ls->ls_dirtbl[bucket].lock);
+ /* search again: the entry may have been added while the lock was
+ dropped, in which case the existing one wins and ours is freed */
+ tmp = search_bucket(ls, name, namelen, bucket);
+ if (tmp) {
+ free_direntry(de);
+ de = tmp;
+ } else {
+ list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
+ }
+ *r_nodeid = de->master_nodeid;
+ write_unlock(&ls->ls_dirtbl[bucket].lock);
+ return 0;
+}
+
+/* Exported wrapper around get_entry(); same arguments and return values. */
+int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
+ int *r_nodeid)
+{
+ return get_entry(ls, nodeid, name, namelen, r_nodeid);
+}
+
+/* Copy the names of master rsb's into the buffer provided.
+ Only select names whose dir node is the given nodeid.
+
+ inbuf/inlen name the rsb after which to continue; an inlen of 0 or 1
+ starts from the head of ls_root_list.  outbuf receives a sequence of
+ records, each a big-endian uint16_t length followed by that many name
+ bytes.  A length of 0 ends a block that filled up, a length of 0xFFFF
+ says there are no more names. */
+
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+ char *outbuf, int outlen, int nodeid)
+{
+ struct list_head *list;
+ struct dlm_rsb *start_r = NULL, *r = NULL;
+ int offset = 0, start_namelen, error, dir_nodeid;
+ char *start_name;
+ uint16_t be_namelen;
+
+ /*
+ * Find the rsb where we left off (or start again)
+ */
+
+ start_namelen = inlen;
+ start_name = inbuf;
+
+ if (start_namelen > 1) {
+ /*
+ * We could also use a find_rsb_root() function here that
+ * searched the ls_root_list.
+ */
+ error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
+ &start_r);
+ DLM_ASSERT(!error && start_r,
+ printk("error %d\n", error););
+ DLM_ASSERT(!list_empty(&start_r->res_root_list),
+ dlm_print_rsb(start_r););
+ /* NOTE(review): start_r is dereferenced again below, after this
+ put - confirm the rsb cannot go away in between */
+ dlm_put_rsb(start_r);
+ }
+
+ /*
+ * Send rsb names for rsb's we're master of and whose directory node
+ * matches the requesting node.
+ */
+
+ down_read(&ls->ls_root_sem);
+ /* resume with the rsb after start_r, or begin at the list head */
+ if (start_r)
+ list = start_r->res_root_list.next;
+ else
+ list = ls->ls_root_list.next;
+
+ for (offset = 0; list != &ls->ls_root_list; list = list->next) {
+ r = list_entry(list, struct dlm_rsb, res_root_list);
+ /* a non-zero res_nodeid means we are not the master of this rsb */
+ if (r->res_nodeid)
+ continue;
+
+ dir_nodeid = dlm_dir_nodeid(r);
+ if (dir_nodeid != nodeid)
+ continue;
+
+ /*
+ * The block ends when we can't fit the following in the
+ * remaining buffer space:
+ * namelen (uint16_t) +
+ * name (r->res_length) +
+ * end-of-block record 0x0000 (uint16_t)
+ */
+
+ if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
+ /* Write end-of-block record */
+ be_namelen = 0;
+ memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+ offset += sizeof(uint16_t);
+ goto out;
+ }
+
+ be_namelen = cpu_to_be16(r->res_length);
+ memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+ offset += sizeof(uint16_t);
+ memcpy(outbuf + offset, r->res_name, r->res_length);
+ offset += r->res_length;
+ }
+
+ /*
+ * If we've reached the end of the list (and there's room) write a
+ * terminating record.
+ */
+
+ if ((list == &ls->ls_root_list) &&
+ (offset + sizeof(uint16_t) <= outlen)) {
+ /* 0xFFFF has the same byte pattern in either byte order */
+ be_namelen = 0xFFFF;
+ memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+ offset += sizeof(uint16_t);
+ }
+
+ out:
+ up_read(&ls->ls_root_sem);
+}
+
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+
+
+/* directory nodeid for an rsb, from its res_hash */
+int dlm_dir_nodeid(struct dlm_rsb *rsb);
+/* directory nodeid selected by the upper 16 bits of hash */
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
+/* remove name from the directory if nodeid is its recorded master */
+void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
+/* move all directory entries to the lockspace's free (recover) list */
+void dlm_dir_clear(struct dlm_ls *ls);
+/* free all entries parked on that list */
+void dlm_clear_free_entries(struct dlm_ls *ls);
+/* rebuild the directory from names fetched from every member node */
+int dlm_recover_directory(struct dlm_ls *ls);
+/* find or add the entry for name; its master is stored in *r_nodeid */
+int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
+ int *r_nodeid);
+/* fill outbuf with names of rsbs mastered here whose dir node is nodeid */
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+ char *outbuf, int outlen, int nodeid);
+
+#endif /* __DIR_DOT_H__ */
+
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..1e5cd67e1b7a
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,543 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DLM_INTERNAL_DOT_H__
+#define __DLM_INTERNAL_DOT_H__
+
+/*
+ * This is the main header file to be included in each DLM source file.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/socket.h>
+#include <linux/kthread.h>
+#include <linux/kobject.h>
+#include <linux/kref.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+
+#include <linux/dlm.h>
+
+#define DLM_LOCKSPACE_LEN 64
+
+/* Size of the temp buffer midcomms allocates on the stack.
+ We try to make this large enough so most messages fit.
+ FIXME: should sctp make this unnecessary? */
+
+#define DLM_INBUF_LEN 148
+
+struct dlm_ls;
+struct dlm_lkb;
+struct dlm_rsb;
+struct dlm_member;
+struct dlm_lkbtable;
+struct dlm_rsbtable;
+struct dlm_dirtable;
+struct dlm_direntry;
+struct dlm_recover;
+struct dlm_header;
+struct dlm_message;
+struct dlm_rcom;
+struct dlm_mhandle;
+
+/* printk at KERN_ERR with a "dlm: " prefix and a trailing newline */
+#define log_print(fmt, args...) \
+ printk(KERN_ERR "dlm: "fmt"\n" , ##args)
+/* same, additionally prefixed with the lockspace name of ls */
+#define log_error(ls, fmt, args...) \
+ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
+
+/* log_debug() is log_error() while DLM_LOG_DEBUG is defined, else a no-op */
+#define DLM_LOG_DEBUG
+#ifdef DLM_LOG_DEBUG
+#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
+#else
+#define log_debug(ls, fmt, args...)
+#endif
+
+/*
+ * Fatal assertion.  When x is false this prints the line, file, the text
+ * of x and jiffies, executes the statements passed as 'do' (typically a
+ * printk of extra state, may be empty), then calls BUG() and panic().
+ *
+ * NOTE(review): the expansion is a bare { } block rather than
+ * do { } while (0), so "DLM_ASSERT(...);" as the unbraced body of an
+ * if that has an else will not parse.
+ */
+#define DLM_ASSERT(x, do) \
+{ \
+ if (!(x)) \
+ { \
+ printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
+ "DLM: assertion: \"%s\"\n" \
+ "DLM: time = %lu\n", \
+ __LINE__, __FILE__, #x, jiffies); \
+ {do} \
+ printk("\n"); \
+ BUG(); \
+ panic("DLM: Record message above and reboot.\n"); \
+ } \
+}
+
+#define DLM_FAKE_USER_AST ERR_PTR(-EINVAL)
+
+
+struct dlm_direntry { /* one resource name and its master in the directory */
+ struct list_head list; /* on a ls_dirtbl bucket or ls_recover_list */
+ uint32_t master_nodeid; /* node recorded as master of the resource */
+ uint16_t length; /* number of bytes in name */
+ char name[1]; /* 'length' bytes, compared with memcmp() */
+};
+
+struct dlm_dirtable { /* one bucket of ls_dirtbl */
+ struct list_head list; /* dlm_direntry structs, linked by ->list */
+ rwlock_t lock; /* taken around searches and updates of list */
+};
+
+struct dlm_rsbtable { /* one bucket of ls_rsbtbl */
+ struct list_head list; /* rsbs, linked by res_hashchain */
+ struct list_head toss;
+ rwlock_t lock; /* read-locked while list is walked */
+};
+
+struct dlm_lkbtable { /* presumably one bucket of ls_lkbtbl - TODO confirm */
+ struct list_head list;
+ rwlock_t lock;
+ uint16_t counter;
+};
+
+/*
+ * Lockspace member (per node in a ls)
+ */
+
+struct dlm_member {
+ struct list_head list; /* on ls_nodes */
+ int nodeid;
+ int weight;
+};
+
+/*
+ * Save and manage recovery state for a lockspace.
+ */
+
+struct dlm_recover {
+ struct list_head list;
+ int *nodeids; /* presumably node_count entries - TODO confirm */
+ int node_count;
+ uint64_t seq;
+};
+
+/*
+ * Pass input args to second stage locking function.
+ */
+
+struct dlm_args {
+ uint32_t flags;
+ void *astaddr; /* cf. lkb_astaddr: caller's ast function */
+ long astparam; /* cf. lkb_astparam: caller's ast arg */
+ void *bastaddr; /* cf. lkb_bastaddr: caller's bast function */
+ int mode;
+ struct dlm_lksb *lksb; /* cf. lkb_lksb: caller's status block */
+};
+
+
+/*
+ * Lock block
+ *
+ * A lock can be one of three types:
+ *
+ * local copy lock is mastered locally
+ * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
+ * process copy lock is mastered on a remote node
+ * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
+ * master copy master node's copy of a lock owned by remote node
+ * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
+ *
+ * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
+ * dlm_unlock. The dlm does not modify these or use any private flags in
+ * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
+ * are sent as-is to the remote master when the lock is remote.
+ *
+ * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
+ * Some internal flags are shared between the master and process nodes;
+ * these shared flags are kept in the lower two bytes. One of these
+ * flags set on the master copy will be propagated to the process copy
+ * and v.v. Other internal flags are private to the master or process
+ * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
+ *
+ * lkb_sbflags: status block flags. These flags are copied directly into
+ * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
+ * ast. All defined in dlm.h with DLM_SBF_ prefix.
+ *
+ * lkb_status: the lock status indicates which rsb queue the lock is
+ * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
+ *
+ * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
+ * reply is needed. Only set when the lkb is on the lockspace waiters
+ * list awaiting a reply from a remote node.
+ *
+ * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
+ * is a master copy, nodeid specifies the remote lock holder, when the
+ * lkb is a process copy, the nodeid specifies the lock master.
+ */
+
+/* lkb_ast_type */
+
+#define AST_COMP 1
+#define AST_BAST 2
+
+/* lkb_status */
+
+#define DLM_LKSTS_WAITING 1
+#define DLM_LKSTS_GRANTED 2
+#define DLM_LKSTS_CONVERT 3
+
+/* lkb_flags */
+
+#define DLM_IFL_MSTCPY 0x00010000
+#define DLM_IFL_RESEND 0x00020000
+#define DLM_IFL_DEAD 0x00040000
+#define DLM_IFL_USER 0x00000001
+#define DLM_IFL_ORPHAN 0x00000002
+
+struct dlm_lkb {
+ struct dlm_rsb *lkb_resource; /* the rsb */
+ struct kref lkb_ref;
+ int lkb_nodeid; /* copied from rsb */
+ int lkb_ownpid; /* pid of lock owner */
+ uint32_t lkb_id; /* our lock ID */
+ uint32_t lkb_remid; /* lock ID on remote partner */
+ uint32_t lkb_exflags; /* external flags from caller */
+ uint32_t lkb_sbflags; /* lksb flags */
+ uint32_t lkb_flags; /* internal flags */
+ uint32_t lkb_lvbseq; /* lvb sequence number */
+
+ int8_t lkb_status; /* granted, waiting, convert */
+ int8_t lkb_rqmode; /* requested lock mode */
+ int8_t lkb_grmode; /* granted lock mode */
+ int8_t lkb_bastmode; /* requested mode */
+ int8_t lkb_highbast; /* highest mode bast sent for */
+
+ int8_t lkb_wait_type; /* type of reply waiting for */
+ int8_t lkb_ast_type; /* type of ast queued for */
+
+ struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
+ struct list_head lkb_statequeue; /* rsb grant/convert/wait queue */
+ struct list_head lkb_rsb_lookup; /* on res_lookup, waiting for rsb lookup */
+ struct list_head lkb_wait_reply; /* on ls_waiters, waiting for remote reply */
+ struct list_head lkb_astqueue; /* need ast to be sent */
+ struct list_head lkb_ownqueue; /* list of locks for a process */
+
+ char *lkb_lvbptr;
+ struct dlm_lksb *lkb_lksb; /* caller's status block */
+ void *lkb_astaddr; /* caller's ast function */
+ void *lkb_bastaddr; /* caller's bast function */
+ long lkb_astparam; /* caller's ast arg */
+};
+
+
+struct dlm_rsb {
+ struct dlm_ls *res_ls; /* the lockspace */
+ struct kref res_ref;
+ struct mutex res_mutex;
+ unsigned long res_flags; /* bit numbers from enum rsb_flags */
+ int res_length; /* length of rsb name */
+ int res_nodeid; /* >0 master node, 0 we are master, -1 lookup */
+ uint32_t res_lvbseq;
+ uint32_t res_hash; /* upper 16 bits select the dir node */
+ uint32_t res_bucket; /* rsbtbl */
+ unsigned long res_toss_time;
+ uint32_t res_first_lkid;
+ struct list_head res_lookup; /* lkbs waiting on first */
+ struct list_head res_hashchain; /* rsbtbl */
+ struct list_head res_grantqueue; /* lkbs, by lkb_statequeue */
+ struct list_head res_convertqueue; /* lkbs, by lkb_statequeue */
+ struct list_head res_waitqueue; /* lkbs, by lkb_statequeue */
+
+ struct list_head res_root_list; /* used for recovery */
+ struct list_head res_recover_list; /* used for recovery */
+ int res_recover_locks_count;
+
+ char *res_lvbptr; /* NULL, or ls_lvblen bytes of lvb */
+ char res_name[1]; /* res_length bytes of name */
+};
+
+/* find_rsb() flags */
+
+#define R_MASTER 1 /* only return rsb if it's a master */
+#define R_CREATE 2 /* create/add rsb if not found */
+
+/* rsb_flags */
+
+enum rsb_flags { /* bit numbers within dlm_rsb.res_flags */
+ RSB_MASTER_UNCERTAIN,
+ RSB_VALNOTVALID, /* debugfs prints the LVB as "(INVALID)" when set */
+ RSB_VALNOTVALID_PREV,
+ RSB_NEW_MASTER,
+ RSB_NEW_MASTER2,
+ RSB_RECOVER_CONVERT,
+ RSB_LOCKS_PURGED,
+};
+
+/* Set bit 'flag' in r->res_flags using __set_bit(). */
+static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+ __set_bit(flag, &r->res_flags);
+}
+
+/* Clear bit 'flag' in r->res_flags using __clear_bit(). */
+static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+ __clear_bit(flag, &r->res_flags);
+}
+
+/* Return the test_bit() result for bit 'flag' of r->res_flags. */
+static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+ return test_bit(flag, &r->res_flags);
+}
+
+
+/* dlm_header is first element of all structs sent between nodes */
+
+#define DLM_HEADER_MAJOR 0x00020000
+#define DLM_HEADER_MINOR 0x00000001
+
+#define DLM_MSG 1
+#define DLM_RCOM 2
+
+struct dlm_header { /* leading member of dlm_message and dlm_rcom */
+ uint32_t h_version; /* cf. DLM_HEADER_MAJOR / DLM_HEADER_MINOR */
+ uint32_t h_lockspace;
+ uint32_t h_nodeid; /* nodeid of sender */
+ uint16_t h_length;
+ uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
+ uint8_t h_pad;
+};
+
+
+#define DLM_MSG_REQUEST 1
+#define DLM_MSG_CONVERT 2
+#define DLM_MSG_UNLOCK 3
+#define DLM_MSG_CANCEL 4
+#define DLM_MSG_REQUEST_REPLY 5
+#define DLM_MSG_CONVERT_REPLY 6
+#define DLM_MSG_UNLOCK_REPLY 7
+#define DLM_MSG_CANCEL_REPLY 8
+#define DLM_MSG_GRANT 9
+#define DLM_MSG_BAST 10
+#define DLM_MSG_LOOKUP 11
+#define DLM_MSG_REMOVE 12
+#define DLM_MSG_LOOKUP_REPLY 13
+
+struct dlm_message { /* sent between nodes; starts with a dlm_header */
+ struct dlm_header m_header;
+ uint32_t m_type; /* DLM_MSG_ */
+ uint32_t m_nodeid;
+ uint32_t m_pid;
+ uint32_t m_lkid; /* lkid on sender */
+ uint32_t m_remid; /* lkid on receiver */
+ uint32_t m_parent_lkid;
+ uint32_t m_parent_remid;
+ uint32_t m_exflags;
+ uint32_t m_sbflags;
+ uint32_t m_flags;
+ uint32_t m_lvbseq;
+ uint32_t m_hash;
+ int m_status;
+ int m_grmode;
+ int m_rqmode;
+ int m_bastmode;
+ int m_asts;
+ int m_result; /* 0 or -EXXX */
+ char m_extra[0]; /* name or lvb */
+};
+
+
+/* bit flags for dlm_ls.ls_recover_status */
+#define DLM_RS_NODES 0x00000001
+#define DLM_RS_NODES_ALL 0x00000002
+#define DLM_RS_DIR 0x00000004
+#define DLM_RS_DIR_ALL 0x00000008
+#define DLM_RS_LOCKS 0x00000010
+#define DLM_RS_LOCKS_ALL 0x00000020
+#define DLM_RS_DONE 0x00000040
+#define DLM_RS_DONE_ALL 0x00000080
+
+/* values for dlm_rcom.rc_type */
+#define DLM_RCOM_STATUS 1
+#define DLM_RCOM_NAMES 2
+#define DLM_RCOM_LOOKUP 3
+#define DLM_RCOM_LOCK 4
+#define DLM_RCOM_STATUS_REPLY 5
+#define DLM_RCOM_NAMES_REPLY 6
+#define DLM_RCOM_LOOKUP_REPLY 7
+#define DLM_RCOM_LOCK_REPLY 8
+
+/* Message that starts with the common dlm_header, followed by a type,
+   a result, a request/reply matching id and a zero-length trailing
+   buffer (rc_buf). */
+struct dlm_rcom {
+ struct dlm_header rc_header;
+ uint32_t rc_type; /* DLM_RCOM_ */
+ int rc_result; /* multi-purpose */
+ uint64_t rc_id; /* match reply with request */
+ char rc_buf[0];
+};
+
+/* Two 32-bit fields plus the 64-bit rf_unused give a 16-byte struct.
+   NOTE(review): presumably carried in dlm_rcom.rc_buf -- confirm. */
+struct rcom_config {
+ uint32_t rf_lvblen;
+ uint32_t rf_lsflags;
+ uint64_t rf_unused;
+};
+
+/* Description of a single lock.  rl_name is a fixed buffer of
+   DLM_RESNAME_MAXLEN bytes (the used length presumably being rl_namelen --
+   confirm); rl_lvb is a zero-length trailing array.
+   NOTE(review): presumably carried in dlm_rcom.rc_buf -- confirm. */
+struct rcom_lock {
+ uint32_t rl_ownpid;
+ uint32_t rl_lkid;
+ uint32_t rl_remid;
+ uint32_t rl_parent_lkid;
+ uint32_t rl_parent_remid;
+ uint32_t rl_exflags;
+ uint32_t rl_flags;
+ uint32_t rl_lvbseq;
+ int rl_result;
+ int8_t rl_rqmode;
+ int8_t rl_grmode;
+ int8_t rl_status;
+ int8_t rl_asts;
+ uint16_t rl_wait_type;
+ uint16_t rl_namelen;
+ char rl_name[DLM_RESNAME_MAXLEN];
+ char rl_lvb[0];
+};
+
+/* Per-lockspace state.  ls_name is declared with a single element and is
+   the last member; NOTE(review): presumably the struct is allocated with
+   room for ls_namelen bytes of name -- confirm in the allocation code. */
+struct dlm_ls {
+ struct list_head ls_list; /* list of lockspaces */
+ dlm_lockspace_t *ls_local_handle;
+ uint32_t ls_global_id; /* global unique lockspace ID */
+ uint32_t ls_exflags;
+ int ls_lvblen;
+ int ls_count; /* reference count */
+ unsigned long ls_flags; /* LSFL_ */
+ struct kobject ls_kobj;
+
+ /* hash tables; the *_size members are used as power-of-two masks
+    (index = value & (size - 1)) by the lookup code */
+ struct dlm_rsbtable *ls_rsbtbl;
+ uint32_t ls_rsbtbl_size;
+
+ struct dlm_lkbtable *ls_lkbtbl;
+ uint32_t ls_lkbtbl_size;
+
+ struct dlm_dirtable *ls_dirtbl;
+ uint32_t ls_dirtbl_size;
+
+ struct mutex ls_waiters_mutex;
+ struct list_head ls_waiters; /* lkbs needing a reply */
+
+ struct list_head ls_nodes; /* current nodes in ls */
+ struct list_head ls_nodes_gone; /* dead node list, recovery */
+ int ls_num_nodes; /* number of nodes in ls */
+ int ls_low_nodeid;
+ int ls_total_weight;
+ int *ls_node_array;
+
+ struct dlm_rsb ls_stub_rsb; /* for returning errors */
+ struct dlm_lkb ls_stub_lkb; /* for returning errors */
+ struct dlm_message ls_stub_ms; /* for faking a reply */
+
+ struct dentry *ls_debug_rsb_dentry; /* debugfs */
+ struct dentry *ls_debug_waiters_dentry; /* debugfs */
+
+ wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
+ int ls_uevent_result;
+
+ struct miscdevice ls_device;
+
+ /* recovery related */
+
+ struct timer_list ls_timer;
+ struct task_struct *ls_recoverd_task;
+ struct mutex ls_recoverd_active;
+ spinlock_t ls_recover_lock;
+ uint32_t ls_recover_status; /* DLM_RS_ */
+ uint64_t ls_recover_seq;
+ struct dlm_recover *ls_recover_args;
+ struct rw_semaphore ls_in_recovery; /* block local requests */
+ struct list_head ls_requestqueue;/* queue remote requests */
+ struct mutex ls_requestqueue_mutex;
+ char *ls_recover_buf;
+ int ls_recover_nodeid; /* for debugging */
+ uint64_t ls_rcom_seq;
+ struct list_head ls_recover_list;
+ spinlock_t ls_recover_list_lock;
+ int ls_recover_list_count;
+ wait_queue_head_t ls_wait_general;
+ struct mutex ls_clear_proc_locks;
+
+ struct list_head ls_root_list; /* root resources */
+ struct rw_semaphore ls_root_sem; /* protect root_list */
+
+ int ls_namelen;
+ char ls_name[1];
+};
+
+/* bit numbers (not masks) in dlm_ls.ls_flags, tested with test_bit() */
+#define LSFL_WORK 0
+#define LSFL_RUNNING 1
+#define LSFL_RECOVERY_STOP 2
+#define LSFL_RCOM_READY 3
+#define LSFL_UEVENT_WAIT 4
+
+/* much of this is just saving user space pointers associated with the
+   lock that we pass back to the user lib with an ast.
+   The five __user members are user-space addresses and must not be
+   dereferenced directly by kernel code. */
+
+struct dlm_user_args {
+ struct dlm_user_proc *proc; /* each process that opens the lockspace
+ device has private data
+ (dlm_user_proc) on the struct file,
+ the process's locks point back to it*/
+ struct dlm_lksb lksb;
+ int old_mode;
+ int update_user_lvb;
+ struct dlm_lksb __user *user_lksb;
+ void __user *castparam;
+ void __user *castaddr;
+ void __user *bastparam;
+ void __user *bastaddr;
+};
+
+/* values used with dlm_user_proc.flags
+   NOTE(review): not visible here whether these are bit numbers or masks --
+   check how the users test them */
+#define DLM_PROC_FLAGS_CLOSING 1
+#define DLM_PROC_FLAGS_COMPAT 2
+
+/* locks list is kept so we can remove all a process's locks when it
+   exits (or orphan those that are persistent) */
+
+/* Each of the two lists is paired with its own spinlock
+   (asts/asts_spin, locks/locks_spin). */
+struct dlm_user_proc {
+ dlm_lockspace_t *lockspace;
+ unsigned long flags; /* DLM_PROC_FLAGS */
+ struct list_head asts;
+ spinlock_t asts_spin;
+ struct list_head locks;
+ spinlock_t locks_spin;
+ wait_queue_head_t wait;
+};
+
+/* Return non-zero when the LSFL_RUNNING bit is NOT set in ls->ls_flags. */
+static inline int dlm_locking_stopped(struct dlm_ls *ls)
+{
+ return !test_bit(LSFL_RUNNING, &ls->ls_flags);
+}
+
+/* Return non-zero when the LSFL_RECOVERY_STOP bit is set in ls->ls_flags. */
+static inline int dlm_recovery_stopped(struct dlm_ls *ls)
+{
+ return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+}
+
+/* Return 1 when the lockspace was created with DLM_LSFL_NODIR set in
+   ls_exflags, 0 otherwise. */
+static inline int dlm_no_directory(struct dlm_ls *ls)
+{
+	return !!(ls->ls_exflags & DLM_LSFL_NODIR);
+}
+
+#endif /* __DLM_INTERNAL_DOT_H__ */
+
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..3f2befa4797b
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3871 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/* Central locking logic has four stages:
+
+ dlm_lock()
+ dlm_unlock()
+
+ request_lock(ls, lkb)
+ convert_lock(ls, lkb)
+ unlock_lock(ls, lkb)
+ cancel_lock(ls, lkb)
+
+ _request_lock(r, lkb)
+ _convert_lock(r, lkb)
+ _unlock_lock(r, lkb)
+ _cancel_lock(r, lkb)
+
+ do_request(r, lkb)
+ do_convert(r, lkb)
+ do_unlock(r, lkb)
+ do_cancel(r, lkb)
+
+ Stage 1 (lock, unlock) is mainly about checking input args and
+ splitting into one of the four main operations:
+
+ dlm_lock = request_lock
+ dlm_lock+CONVERT = convert_lock
+ dlm_unlock = unlock_lock
+ dlm_unlock+CANCEL = cancel_lock
+
+ Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
+ provided to the next stage.
+
+ Stage 3, _xxxx_lock(), determines if the operation is local or remote.
+ When remote, it calls send_xxxx(), when local it calls do_xxxx().
+
+ Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
+ given rsb and lkb and queues callbacks.
+
+ For remote operations, send_xxxx() results in the corresponding do_xxxx()
+ function being executed on the remote node. The connecting send/receive
+ calls on local (L) and remote (R) nodes:
+
+ L: send_xxxx() -> R: receive_xxxx()
+ R: do_xxxx()
+ L: receive_xxxx_reply() <- R: send_xxxx_reply()
+*/
+#include <linux/types.h>
+#include "dlm_internal.h"
+#include <linux/dlm_device.h>
+#include "memory.h"
+#include "lowcomms.h"
+#include "requestqueue.h"
+#include "util.h"
+#include "dir.h"
+#include "member.h"
+#include "lockspace.h"
+#include "ast.h"
+#include "lock.h"
+#include "rcom.h"
+#include "recover.h"
+#include "lvb_table.h"
+#include "user.h"
+#include "config.h"
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_remove(struct dlm_rsb *r);
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct dlm_message *ms);
+static int receive_extralen(struct dlm_message *ms);
+
+/*
+ * Lock compatibility matrix - thanks Steve
+ * UN = Unlocked state. Not really a state, used as a flag
+ * PD = Padding. Used to make the matrix a nice power of two in size
+ * Other states are the same as the VMS DLM.
+ * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
+ *
+ * 1 = the two modes are compatible, 0 = they conflict.  The table is
+ * symmetric, which is why the index order does not matter.
+ */
+
+static const int __dlm_compat_matrix[8][8] = {
+ /* UN NL CR CW PR PW EX PD */
+ {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
+ {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
+ {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
+ {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
+ {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
+ {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
+ {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
+};
+
+/*
+ * This defines the direction of transfer of LVB data.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ * 1 = LVB is returned to the caller
+ * 0 = LVB is written to the resource
+ * -1 = nothing happens to the LVB
+ *
+ * The +1 offset lets a mode value of -1 select row/column 0 (UN);
+ * presumably DLM_LOCK_IV is -1 -- confirm against its definition.
+ */
+
+const int dlm_lvb_operations[8][8] = {
+ /* UN NL CR CW PR PW EX PD*/
+ { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
+ { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
+ { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
+ { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
+ { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
+ { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
+ { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
+ { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
+};
+
+/* modes_compat(gr, rq): non-zero when the granted mode of lkb "gr" is
+   compatible with the requested mode of lkb "rq" */
+#define modes_compat(gr, rq) \
+ __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
+
+/* Same lookup for two raw mode values.  No range check is done: both
+   arguments must lie in -1..6 or the table is indexed out of bounds. */
+int dlm_modes_compat(int mode1, int mode2)
+{
+ return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
+}
+
+/*
+ * Compatibility matrix for conversions with QUECVT set.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ *
+ * The 1 entries are exactly the conversions to a later column than the
+ * granted mode's own, plus PR->CW; everything else (including same-mode
+ * and the UN/PD rows and columns) is 0.
+ */
+
+static const int __quecvt_compat_matrix[8][8] = {
+ /* UN NL CR CW PR PW EX PD */
+ {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
+ {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
+ {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
+ {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
+ {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
+ {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
+ {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
+};
+
+/* Debug helper: log the identifying fields, flags, modes and state of
+   one lkb at KERN_ERR level. */
+void dlm_print_lkb(struct dlm_lkb *lkb)
+{
+ printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
+ " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
+ lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
+ lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
+ lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
+}
+
+/* Debug helper: log nodeid, flags, first lkid, recover-locks count and
+   name of one rsb at KERN_ERR level.
+   NOTE(review): res_name is printed with %s, but create_rsb() copies
+   exactly res_length bytes with memcpy() -- this assumes the allocation
+   leaves a terminating NUL after the name; confirm in allocate_rsb(). */
+void dlm_print_rsb(struct dlm_rsb *r)
+{
+ printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
+ r->res_nodeid, r->res_flags, r->res_first_lkid,
+ r->res_recover_locks_count, r->res_name);
+}
+
+/* Debug helper: print the rsb itself, whether its root/recover list
+   linkage is empty, and then every lkb on its lookup, grant, convert and
+   wait queues.  The lists are walked without taking any lock here. */
+void dlm_dump_rsb(struct dlm_rsb *r)
+{
+ struct dlm_lkb *lkb;
+
+ dlm_print_rsb(r);
+
+ printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
+ list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
+ printk(KERN_ERR "rsb lookup list\n");
+ /* the lookup list links lkbs through lkb_rsb_lookup, the three state
+    queues below link them through lkb_statequeue */
+ list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
+ dlm_print_lkb(lkb);
+ printk(KERN_ERR "rsb grant queue:\n");
+ list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
+ dlm_print_lkb(lkb);
+ printk(KERN_ERR "rsb convert queue:\n");
+ list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
+ dlm_print_lkb(lkb);
+ printk(KERN_ERR "rsb wait queue:\n");
+ list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
+ dlm_print_lkb(lkb);
+}
+
+/* Threads cannot use the lockspace while it's being recovered */
+
+/* Take ls_in_recovery for reading (may sleep in down_read()). */
+static inline void lock_recovery(struct dlm_ls *ls)
+{
+ down_read(&ls->ls_in_recovery);
+}
+
+/* Release the read hold on ls_in_recovery taken by lock_recovery() or a
+   successful lock_recovery_try(). */
+static inline void unlock_recovery(struct dlm_ls *ls)
+{
+ up_read(&ls->ls_in_recovery);
+}
+
+/* Non-blocking variant of lock_recovery(); returns the result of
+   down_read_trylock() on ls_in_recovery. */
+static inline int lock_recovery_try(struct dlm_ls *ls)
+{
+ return down_read_trylock(&ls->ls_in_recovery);
+}
+
+/* Return 1 unless the lkb was requested with DLM_LKF_NOQUEUE. */
+static inline int can_be_queued(struct dlm_lkb *lkb)
+{
+ return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
+}
+
+/* Return the raw DLM_LKF_NOQUEUEBAST bit of lkb_exflags: non-zero (not
+   necessarily 1) when the flag is set, 0 otherwise. */
+static inline int force_blocking_asts(struct dlm_lkb *lkb)
+{
+ return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
+}
+
+/* Return the raw DLM_SBF_DEMOTED bit of lkb_sbflags: non-zero (not
+   necessarily 1) when the flag is set, 0 otherwise. */
+static inline int is_demoted(struct dlm_lkb *lkb)
+{
+ return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
+}
+
+/* Return 1 when the rsb's res_nodeid is non-zero, 0 when it is zero.
+   Asserts res_nodeid >= 0, so this must not be called while res_nodeid
+   still holds the -1 that find_rsb() assigns to a freshly created rsb. */
+static inline int is_remote(struct dlm_rsb *r)
+{
+ DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
+ return !!r->res_nodeid;
+}
+
+/* Return 1 when the lkb has a non-zero lkb_nodeid and does not carry
+   DLM_IFL_MSTCPY, 0 otherwise. */
+static inline int is_process_copy(struct dlm_lkb *lkb)
+{
+ return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
+}
+
+/* Return 1 when the lkb carries DLM_IFL_MSTCPY, 0 otherwise.  A master
+   copy is asserted to have a non-zero lkb_nodeid. */
+static inline int is_master_copy(struct dlm_lkb *lkb)
+{
+	int mstcpy = (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
+
+	if (mstcpy) {
+		DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
+	}
+	return mstcpy;
+}
+
+/* Return 1 for a conversion between PR and CW (in either direction),
+   0 for anything else. */
+static inline int middle_conversion(struct dlm_lkb *lkb)
+{
+	int pr_to_cw = (lkb->lkb_grmode == DLM_LOCK_PR &&
+			lkb->lkb_rqmode == DLM_LOCK_CW);
+	int cw_to_pr = (lkb->lkb_rqmode == DLM_LOCK_PR &&
+			lkb->lkb_grmode == DLM_LOCK_CW);
+
+	return pr_to_cw || cw_to_pr;
+}
+
+/* Return 1 when the requested mode is numerically lower than the granted
+   mode and the conversion is not a PR<->CW "middle" conversion. */
+static inline int down_conversion(struct dlm_lkb *lkb)
+{
+	if (middle_conversion(lkb))
+		return 0;
+
+	return lkb->lkb_rqmode < lkb->lkb_grmode;
+}
+
+/* Queue a completion ast for lkb with result "rv".  Nothing is done for a
+   master copy.  Otherwise the status and the accumulated sbflags are
+   stored in the lkb's lksb before the AST_COMP ast is added. */
+static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	if (!is_master_copy(lkb)) {
+		DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
+
+		lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
+		lkb->lkb_lksb->sb_status = rv;
+
+		dlm_add_ast(lkb, AST_COMP);
+	}
+}
+
+/* Deliver a blocking ast for mode "rqmode": a master copy gets it
+   forwarded with send_bast(), any other lkb has the mode recorded in
+   lkb_bastmode and an AST_BAST ast added. */
+static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
+{
+	if (is_master_copy(lkb)) {
+		send_bast(r, lkb, rqmode);
+		return;
+	}
+
+	lkb->lkb_bastmode = rqmode;
+	dlm_add_ast(lkb, AST_BAST);
+}
+
+/*
+ * Basic operations on rsb's and lkb's
+ */
+
+/* Allocate an rsb for the "len"-byte resource name and initialize its
+   lockspace pointer, name, mutex and list heads.  Returns NULL when
+   allocate_rsb() fails.
+   res_hash, res_bucket, res_nodeid and the reference count are NOT set
+   here; find_rsb() fills them in after this call. */
+static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
+{
+ struct dlm_rsb *r;
+
+ r = allocate_rsb(ls, len);
+ if (!r)
+ return NULL;
+
+ r->res_ls = ls;
+ r->res_length = len;
+ /* exactly len bytes are copied; no terminator is written here */
+ memcpy(r->res_name, name, len);
+ mutex_init(&r->res_mutex);
+
+ INIT_LIST_HEAD(&r->res_lookup);
+ INIT_LIST_HEAD(&r->res_grantqueue);
+ INIT_LIST_HEAD(&r->res_convertqueue);
+ INIT_LIST_HEAD(&r->res_waitqueue);
+ INIT_LIST_HEAD(&r->res_root_list);
+ INIT_LIST_HEAD(&r->res_recover_list);
+
+ return r;
+}
+
+/* Scan one hash chain for an rsb whose name matches "name"/"len".
+   Returns:
+     0        match found, *r_ret set
+     -ENOTBLK match found (and *r_ret set) but R_MASTER was requested and
+              the rsb has a non-zero res_nodeid
+     -EBADR   no match; *r_ret is left untouched */
+static int search_rsb_list(struct list_head *head, char *name, int len,
+			   unsigned int flags, struct dlm_rsb **r_ret)
+{
+	struct dlm_rsb *r;
+
+	list_for_each_entry(r, head, res_hashchain) {
+		if (len != r->res_length || memcmp(name, r->res_name, len))
+			continue;
+
+		*r_ret = r;
+		if (r->res_nodeid && (flags & R_MASTER))
+			return -ENOTBLK;
+		return 0;
+	}
+
+	return -EBADR;
+}
+
+/* Look up an rsb by name in hash bucket "b": first on the active list,
+   then on the toss list.  Both callers in this file (search_rsb() and
+   find_rsb()) hold ls_rsbtbl[b].lock for writing around the call.
+
+   Returns 0 when the rsb was found:
+     - on the active list: a reference is added with kref_get()
+     - on the toss list: the rsb is moved back to the active list and
+       keeps the reference count of 1 that toss_rsb() gave it
+   Otherwise returns the error of the toss list search (-EBADR or
+   -ENOTBLK, see search_rsb_list()).
+
+   *r_ret is always written.  It is NULL when the name was found on
+   neither list; previously an uninitialized local was stored there.
+
+   NOTE(review): when the active-list search returns -ENOTBLK the toss list
+   is still searched and its result replaces -ENOTBLK -- confirm that this
+   is intended. */
+static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+		       unsigned int flags, struct dlm_rsb **r_ret)
+{
+	struct dlm_rsb *r = NULL;
+	int error;
+
+	error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
+	if (!error) {
+		kref_get(&r->res_ref);
+		goto out;
+	}
+	error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
+	if (error)
+		goto out;
+
+	/* revive the tossed rsb */
+	list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
+
+	if (dlm_no_directory(ls))
+		goto out;
+
+	/* the master value cached in a tossed rsb may be stale */
+	if (r->res_nodeid == -1) {
+		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+		r->res_first_lkid = 0;
+	} else if (r->res_nodeid > 0) {
+		rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
+		r->res_first_lkid = 0;
+	} else {
+		DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
+		DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
+	}
+ out:
+	*r_ret = r;
+	return error;
+}
+
+/* Locked wrapper: run _search_rsb() with the write lock of hash bucket
+   "b" held and pass its return value through unchanged. */
+static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ int error;
+ write_lock(&ls->ls_rsbtbl[b].lock);
+ error = _search_rsb(ls, name, len, b, flags, r_ret);
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+ return error;
+}
+
+/*
+ * Find rsb in rsbtbl and potentially create/add one
+ *
+ * Delaying the release of rsb's has a similar benefit to applications keeping
+ * NL locks on an rsb, but without the guarantee that the cached master value
+ * will still be valid when the rsb is reused. Apps aren't always smart enough
+ * to keep NL locks on an rsb that they may lock again shortly; this can lead
+ * to excessive master lookups and removals if we don't delay the release.
+ *
+ * Searching for an rsb means looking through both the normal list and toss
+ * list. When found on the toss list the rsb is moved to the normal list with
+ * ref count of 1; when found on normal list the ref count is incremented.
+ */
+
+/* Returns 0 with a referenced rsb in *r_ret, or a negative error:
+   -EBADR (not found and R_CREATE not requested), -ENOTBLK (found but
+   not a master copy while R_MASTER was requested) or -ENOMEM.
+   *r_ret is written on every path, but callers must only use it when
+   0 is returned. */
+static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ struct dlm_rsb *r, *tmp;
+ uint32_t hash, bucket;
+ int error = 0;
+
+ /* without a directory an rsb is always created when missing */
+ if (dlm_no_directory(ls))
+ flags |= R_CREATE;
+
+ /* the mask assumes ls_rsbtbl_size is a power of two */
+ hash = jhash(name, namelen, 0);
+ bucket = hash & (ls->ls_rsbtbl_size - 1);
+
+ error = search_rsb(ls, name, namelen, bucket, flags, &r);
+ if (!error)
+ goto out;
+
+ if (error == -EBADR && !(flags & R_CREATE))
+ goto out;
+
+ /* the rsb was found but wasn't a master copy */
+ if (error == -ENOTBLK)
+ goto out;
+
+ /* not found and creation requested: build the new rsb without
+    holding the bucket lock */
+ error = -ENOMEM;
+ r = create_rsb(ls, name, namelen);
+ if (!r)
+ goto out;
+
+ r->res_hash = hash;
+ r->res_bucket = bucket;
+ r->res_nodeid = -1;
+ kref_init(&r->res_ref);
+
+ /* With no directory, the master can be set immediately */
+ if (dlm_no_directory(ls)) {
+ int nodeid = dlm_dir_nodeid(r);
+ if (nodeid == dlm_our_nodeid())
+ nodeid = 0;
+ r->res_nodeid = nodeid;
+ }
+
+ /* search again under the lock: the same name may have been added
+    while the lock was not held; if so use that rsb and drop ours */
+ write_lock(&ls->ls_rsbtbl[bucket].lock);
+ error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
+ if (!error) {
+ write_unlock(&ls->ls_rsbtbl[bucket].lock);
+ free_rsb(r);
+ r = tmp;
+ goto out;
+ }
+ list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
+ write_unlock(&ls->ls_rsbtbl[bucket].lock);
+ error = 0;
+ out:
+ *r_ret = r;
+ return error;
+}
+
+/* Non-static entry point that forwards to find_rsb() unchanged. */
+int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ return find_rsb(ls, name, namelen, flags, r_ret);
+}
+
+/* This is only called to add a reference when the code already holds
+   a valid reference to the rsb, so there's no need for locking.
+   The reference is dropped again with put_rsb() or unhold_rsb(). */
+
+static inline void hold_rsb(struct dlm_rsb *r)
+{
+ kref_get(&r->res_ref);
+}
+
+/* Non-static entry point that forwards to hold_rsb(). */
+void dlm_hold_rsb(struct dlm_rsb *r)
+{
+ hold_rsb(r);
+}
+
+/* kref release function passed to kref_put() by put_rsb() and
+   unhold_rsb().  The rsb is not freed: its reference count is reset to 1,
+   it is moved to the toss list of its bucket, the toss time is recorded
+   and any lvb buffer is freed.  put_rsb() holds the bucket write lock
+   while this runs. */
+static void toss_rsb(struct kref *kref)
+{
+ struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+ struct dlm_ls *ls = r->res_ls;
+
+ DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
+ /* this count of 1 is the one shrink_bucket() later drops */
+ kref_init(&r->res_ref);
+ list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
+ r->res_toss_time = jiffies;
+ if (r->res_lvbptr) {
+ free_lvb(r->res_lvbptr);
+ r->res_lvbptr = NULL;
+ }
+}
+
+/* When all references to the rsb are gone it's transferred to
+   the tossed list for later disposal.  The bucket write lock is held
+   across kref_put() because toss_rsb() moves the rsb between the
+   bucket's lists. */
+
+static void put_rsb(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+ uint32_t bucket = r->res_bucket;
+
+ write_lock(&ls->ls_rsbtbl[bucket].lock);
+ kref_put(&r->res_ref, toss_rsb);
+ write_unlock(&ls->ls_rsbtbl[bucket].lock);
+}
+
+/* Non-static entry point that forwards to put_rsb(). */
+void dlm_put_rsb(struct dlm_rsb *r)
+{
+ put_rsb(r);
+}
+
+/* See comment for unhold_lkb */
+
+/* Drop a reference without taking the bucket lock.  Only valid when this
+   cannot be the last reference: the assert fires if kref_put() reports
+   that the release function ran. */
+static void unhold_rsb(struct dlm_rsb *r)
+{
+ int rv;
+ rv = kref_put(&r->res_ref, toss_rsb);
+ DLM_ASSERT(!rv, dlm_dump_rsb(r););
+}
+
+/* kref release function used by shrink_bucket() for rsbs on the toss
+   list.  It only asserts that none of the rsb's lists is still in use;
+   unlinking and freeing are done by the caller. */
+static void kill_rsb(struct kref *kref)
+{
+ struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+
+ /* All work is done after the return from kref_put() so we
+ can release the write_lock before the remove and free. */
+
+ DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
+ DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
+ DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
+ DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
+ DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
+ DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
+}
+
+/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
+ The rsb must exist as long as any lkb's for it do. */
+
+/* Point the lkb at rsb "r" and take an rsb reference on its behalf;
+   detach_lkb() drops that reference again. */
+static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ hold_rsb(r);
+ lkb->lkb_resource = r;
+}
+
+/* Undo attach_lkb(): drop the rsb reference held through lkb_resource and
+   clear the pointer.  Does nothing when no rsb is attached. */
+static void detach_lkb(struct dlm_lkb *lkb)
+{
+	if (!lkb->lkb_resource)
+		return;
+
+	put_rsb(lkb->lkb_resource);
+	lkb->lkb_resource = NULL;
+}
+
+/* Allocate a new lkb, give it a lock id that is unused within its
+   bucket and link it into the lockspace's lkb table.
+   Returns 0 with the lkb in *lkb_ret (holding one reference from
+   kref_init()), or -ENOMEM when allocate_lkb() fails. */
+static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+ struct dlm_lkb *lkb, *tmp;
+ uint32_t lkid = 0;
+ uint16_t bucket;
+
+ lkb = allocate_lkb(ls);
+ if (!lkb)
+ return -ENOMEM;
+
+ lkb->lkb_nodeid = -1;
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ kref_init(&lkb->lkb_ref);
+ INIT_LIST_HEAD(&lkb->lkb_ownqueue);
+
+ /* pick a random bucket; the mask assumes ls_lkbtbl_size is a power
+    of two */
+ get_random_bytes(&bucket, sizeof(bucket));
+ bucket &= (ls->ls_lkbtbl_size - 1);
+
+ write_lock(&ls->ls_lkbtbl[bucket].lock);
+
+ /* counter can roll over so we must verify lkid is not in use */
+
+ /* lkid layout: low 16 bits = bucket, bits above = per-bucket counter.
+    A candidate of 0, or one that is already on the bucket list, makes
+    the loop try the next counter value, so lkid 0 is never handed out. */
+ while (lkid == 0) {
+ lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
+
+ list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
+ lkb_idtbl_list) {
+ if (tmp->lkb_id != lkid)
+ continue;
+ lkid = 0;
+ break;
+ }
+ }
+
+ lkb->lkb_id = lkid;
+ list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
+ write_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+ *lkb_ret = lkb;
+ return 0;
+}
+
+/* Walk the lkb table bucket selected by the low 16 bits of "lkid" and
+   return the lkb with that id, or NULL.  No locking, no range check of
+   the bucket and no reference is taken here; find_lkb() does all three
+   around its call. */
+static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
+{
+ uint16_t bucket = lkid & 0xFFFF;
+ struct dlm_lkb *lkb;
+
+ list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
+ if (lkb->lkb_id == lkid)
+ return lkb;
+ }
+ return NULL;
+}
+
+/* Look up an lkb by lock id and take a reference on it.
+   Returns:
+     0         found; *lkb_ret holds the referenced lkb
+     -ENOENT   no lkb with that id; *lkb_ret is set to NULL
+     -EBADSLT  the bucket encoded in the id is outside the table;
+               *lkb_ret is not written */
+static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
+{
+	uint16_t bucket = lkid & 0xFFFF;
+	struct dlm_lkb *found;
+	int error = -ENOENT;
+
+	if (bucket >= ls->ls_lkbtbl_size)
+		return -EBADSLT;
+
+	read_lock(&ls->ls_lkbtbl[bucket].lock);
+	found = __find_lkb(ls, lkid);
+	if (found) {
+		kref_get(&found->lkb_ref);
+		error = 0;
+	}
+	read_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+	*lkb_ret = found;
+	return error;
+}
+
+/* kref release function for lkbs.  It only asserts that the lkb is no
+   longer on a state queue (lkb_status == 0); unlinking and freeing are
+   done by __put_lkb() after kref_put() returns. */
+static void kill_lkb(struct kref *kref)
+{
+ struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+
+ /* All work is done after the return from kref_put() so we
+ can release the write_lock before the detach_lkb */
+
+ DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+}
+
+/* __put_lkb() is used when an lkb may not have an rsb attached to
+   it so we need to provide the lockspace explicitly.
+
+   Drops one reference.  Returns 1 when that was the last one and the lkb
+   has been unlinked from the lkb table, detached from its rsb and freed;
+   returns 0 when other references remain. */
+
+static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	uint16_t bucket = lkb->lkb_id & 0xFFFF;
+
+	write_lock(&ls->ls_lkbtbl[bucket].lock);
+	if (!kref_put(&lkb->lkb_ref, kill_lkb)) {
+		write_unlock(&ls->ls_lkbtbl[bucket].lock);
+		return 0;
+	}
+
+	/* last reference: unlink under the lock, tear down outside of it */
+	list_del(&lkb->lkb_idtbl_list);
+	write_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+	detach_lkb(lkb);
+
+	/* for local/process lkbs, lvbptr points to caller's lksb */
+	if (lkb->lkb_lvbptr && is_master_copy(lkb))
+		free_lvb(lkb->lkb_lvbptr);
+	free_lkb(lkb);
+	return 1;
+}
+
+/* Drop a reference on an lkb that has an rsb attached; the lockspace is
+   taken from that rsb.  Returns what __put_lkb() returns (1 when the lkb
+   was freed, 0 otherwise). */
+int dlm_put_lkb(struct dlm_lkb *lkb)
+{
+ struct dlm_ls *ls;
+
+ DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
+ DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
+
+ ls = lkb->lkb_resource->res_ls;
+ return __put_lkb(ls, lkb);
+}
+
+/* This is only called to add a reference when the code already holds
+   a valid reference to the lkb, so there's no need for locking.
+   The reference is dropped again with unhold_lkb() or __put_lkb(). */
+
+static inline void hold_lkb(struct dlm_lkb *lkb)
+{
+ kref_get(&lkb->lkb_ref);
+}
+
+/* This is called when we need to remove a reference and are certain
+   it's not the last ref. e.g. del_lkb is always called between a
+   find_lkb/put_lkb and is always the inverse of a previous add_lkb.
+   put_lkb would work fine, but would involve unnecessary locking.
+   The assert fires if kref_put() reports that the release function ran,
+   i.e. if this did turn out to be the last reference. */
+
+static inline void unhold_lkb(struct dlm_lkb *lkb)
+{
+ int rv;
+ rv = kref_put(&lkb->lkb_ref, kill_lkb);
+ DLM_ASSERT(!rv, dlm_print_lkb(lkb););
+}
+
+/* Insert "new" into the queue "head" immediately before the first lkb
+   whose lkb_rqmode is lower than "mode"; when there is no such lkb (or
+   the queue is empty) "new" is added at the tail.
+
+   list_for_each_entry() never leaves its cursor NULL: after a complete
+   walk the cursor is a bogus pointer computed from the list head.  The
+   former "if (!lkb)" test was therefore dead code, and the tail case only
+   worked because that bogus cursor's lkb_statequeue aliases "head".  The
+   insertion is now done inside the loop so the cursor is never used after
+   the walk has ended. */
+static void lkb_add_ordered(struct list_head *new, struct list_head *head,
+			    int mode)
+{
+	struct dlm_lkb *lkb;
+
+	list_for_each_entry(lkb, head, lkb_statequeue) {
+		if (lkb->lkb_rqmode < mode) {
+			/* list_add_tail() on an entry's own node links
+			   "new" directly in front of that entry */
+			list_add_tail(new, &lkb->lkb_statequeue);
+			return;
+		}
+	}
+
+	list_add_tail(new, head);
+}
+
+/* add/remove lkb to rsb's grant/convert/wait queue */
+
+/* Put the lkb on the rsb queue selected by "status" and record that
+   status in lkb_status.  A reference is taken on the lkb for the time it
+   sits on a queue; del_lkb() drops it again.
+     - wait and convert queues: tail insertion, or head insertion when the
+       lkb carries DLM_LKF_HEADQUE
+     - grant queue: ordered insertion through lkb_add_ordered()
+   The lkb must not be on a queue already (asserted). */
+static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
+{
+	struct list_head *queue;
+
+	kref_get(&lkb->lkb_ref);
+
+	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+
+	lkb->lkb_status = status;
+
+	if (status == DLM_LKSTS_GRANTED) {
+		/* convention says granted locks kept in order of grmode */
+		lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
+				lkb->lkb_grmode);
+		return;
+	}
+
+	if (status == DLM_LKSTS_WAITING) {
+		queue = &r->res_waitqueue;
+	} else if (status == DLM_LKSTS_CONVERT) {
+		queue = &r->res_convertqueue;
+	} else {
+		DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
+		return;
+	}
+
+	if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+		list_add(&lkb->lkb_statequeue, queue);
+	else
+		list_add_tail(&lkb->lkb_statequeue, queue);
+}
+
+/* Inverse of add_lkb(): clear lkb_status, unlink the lkb from its state
+   queue and drop the reference add_lkb() took.  unhold_lkb() asserts that
+   this is not the last reference. */
+static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ lkb->lkb_status = 0;
+ list_del(&lkb->lkb_statequeue);
+ unhold_lkb(lkb);
+}
+
+/* Move the lkb from its current state queue to the queue for "sts".
+   The temporary hold/unhold pair brackets the del_lkb()/add_lkb()
+   sequence with an extra reference. */
+static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
+{
+ hold_lkb(lkb);
+ del_lkb(r, lkb);
+ add_lkb(r, lkb, sts);
+ unhold_lkb(lkb);
+}
+
+/* add/remove lkb from global waiters list of lkb's waiting for
+ a reply from a remote node */
+
+/* Record that the lkb is waiting for a reply to a message of type
+   "mstype": store the type in lkb_wait_type, take a reference and link
+   the lkb onto ls_waiters, all under ls_waiters_mutex.  If the lkb is
+   already waiting, only an error is logged and nothing is changed. */
+static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+	if (!lkb->lkb_wait_type) {
+		lkb->lkb_wait_type = mstype;
+		kref_get(&lkb->lkb_ref);
+		list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
+	} else {
+		log_print("add_to_waiters error %d", lkb->lkb_wait_type);
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+}
+
+/* Unlocked half of remove_from_waiters(): clear lkb_wait_type, unlink
+   the lkb from the waiters list and drop the reference add_to_waiters()
+   took.  Returns 0, or -EINVAL (after logging) when the lkb was not
+   waiting.  No lock is taken here; remove_from_waiters() calls this with
+   ls_waiters_mutex held. */
+static int _remove_from_waiters(struct dlm_lkb *lkb)
+{
+	if (!lkb->lkb_wait_type) {
+		log_print("remove_from_waiters error");
+		return -EINVAL;
+	}
+
+	lkb->lkb_wait_type = 0;
+	list_del(&lkb->lkb_wait_reply);
+	unhold_lkb(lkb);
+	return 0;
+}
+
+/* Locked wrapper: run _remove_from_waiters() under ls_waiters_mutex and
+   return its result (0 or -EINVAL).  The lockspace is reached through
+   lkb->lkb_resource, so the lkb must have an rsb attached. */
+static int remove_from_waiters(struct dlm_lkb *lkb)
+{
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+ int error;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+ error = _remove_from_waiters(lkb);
+ mutex_unlock(&ls->ls_waiters_mutex);
+ return error;
+}
+
+/* Remove the rsb's name from the resource directory.  Nothing is done
+   for a lockspace without a directory.  When this node is the directory
+   node for the rsb the entry is removed locally, otherwise a remove
+   message is sent with send_remove(). */
+static void dir_remove(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+	int dir_nodeid;
+
+	if (dlm_no_directory(ls))
+		return;
+
+	dir_nodeid = dlm_dir_nodeid(r);
+	if (dir_nodeid == dlm_our_nodeid())
+		dlm_dir_remove_entry(ls, dir_nodeid,
+				     r->res_name, r->res_length);
+	else
+		send_remove(r);
+}
+
+/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
+   found since they are in order of newest to oldest? */
+
+/* Free the rsbs on the toss list of bucket "b" whose toss time is at
+   least dlm_config.toss_secs seconds in the past.  Returns the number of
+   rsbs freed.
+   One rsb is handled per pass of the outer loop: the bucket lock has to
+   be dropped before dir_remove()/free_rsb(), so the toss list is scanned
+   again from its tail after every removal. */
+static int shrink_bucket(struct dlm_ls *ls, int b)
+{
+ struct dlm_rsb *r;
+ int count = 0, found;
+
+ for (;;) {
+ found = 0;
+ write_lock(&ls->ls_rsbtbl[b].lock);
+ list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
+ res_hashchain) {
+ if (!time_after_eq(jiffies, r->res_toss_time +
+ dlm_config.toss_secs * HZ))
+ continue;
+ found = 1;
+ break;
+ }
+
+ /* r is only meaningful below when found is set */
+ if (!found) {
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+ break;
+ }
+
+ if (kref_put(&r->res_ref, kill_rsb)) {
+ list_del(&r->res_hashchain);
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+
+ if (is_master(r))
+ dir_remove(r);
+ free_rsb(r);
+ count++;
+ } else {
+ /* NOTE(review): the rsb stays on the toss list with its
+    count already decremented, so the next pass finds it
+    again -- confirm this cannot repeat indefinitely */
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+ log_error(ls, "tossed rsb in use %s", r->res_name);
+ }
+ }
+
+ return count;
+}
+
+/* Run shrink_bucket() over every bucket of the rsb hash table, yielding
+   the CPU with cond_resched() between buckets.  Does nothing while
+   locking is stopped (LSFL_RUNNING clear). */
+void dlm_scan_rsbs(struct dlm_ls *ls)
+{
+ int i;
+
+ if (dlm_locking_stopped(ls))
+ return;
+
+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+ shrink_bucket(ls, i);
+ cond_resched();
+ }
+}
+
+/* lkb is master or local copy */
+
+/* Transfer the lock value block between lkb and rsb when a lock is
+   granted.  The direction comes from dlm_lvb_operations[] indexed by the
+   lkb's granted and requested modes; ls_lvblen bytes are copied.
+   On the paths that do not return early, DLM_SBF_VALNOTVALID is set in
+   the lkb's sbflags when the rsb is flagged RSB_VALNOTVALID. */
+static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int b, len = r->res_ls->ls_lvblen;
+
+ /* b=1 lvb returned to caller
+ b=0 lvb written to rsb or invalidated
+ b=-1 do nothing */
+
+ b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+
+ if (b == 1) {
+ /* rsb -> lkb: needs an lkb buffer, DLM_LKF_VALBLK and an
+    existing rsb lvb */
+ if (!lkb->lkb_lvbptr)
+ return;
+
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ return;
+
+ if (!r->res_lvbptr)
+ return;
+
+ memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
+ lkb->lkb_lvbseq = r->res_lvbseq;
+
+ } else if (b == 0) {
+ /* lkb -> rsb, unless the caller asked to invalidate */
+ if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+ rsb_set_flag(r, RSB_VALNOTVALID);
+ return;
+ }
+
+ if (!lkb->lkb_lvbptr)
+ return;
+
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ return;
+
+ /* the rsb's lvb buffer is allocated on first use */
+ if (!r->res_lvbptr)
+ r->res_lvbptr = allocate_lvb(r->res_ls);
+
+ /* allocation failure is silent: the lvb is simply not stored */
+ if (!r->res_lvbptr)
+ return;
+
+ memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
+ r->res_lvbseq++;
+ lkb->lkb_lvbseq = r->res_lvbseq;
+ rsb_clear_flag(r, RSB_VALNOTVALID);
+ }
+
+ if (rsb_flag(r, RSB_VALNOTVALID))
+ lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
+}
+
+/* On unlock, store the lkb's lock value block in the rsb.  This only
+   happens when the lock was granted at DLM_LOCK_PW or a higher mode
+   value; DLM_LKF_IVVALBLK marks the rsb's lvb invalid instead. */
+static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ if (lkb->lkb_grmode < DLM_LOCK_PW)
+ return;
+
+ if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+ rsb_set_flag(r, RSB_VALNOTVALID);
+ return;
+ }
+
+ if (!lkb->lkb_lvbptr)
+ return;
+
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ return;
+
+ /* the rsb's lvb buffer is allocated on first use */
+ if (!r->res_lvbptr)
+ r->res_lvbptr = allocate_lvb(r->res_ls);
+
+ /* allocation failure is silent: the lvb is simply not stored */
+ if (!r->res_lvbptr)
+ return;
+
+ memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+ r->res_lvbseq++;
+ rsb_clear_flag(r, RSB_VALNOTVALID);
+}
+
+/* lkb is process copy (pc) */
+
+/* Process-copy counterpart of set_lvb_lock(): when the mode transition
+   returns the lvb to the caller (dlm_lvb_operations[] == 1), copy the lvb
+   carried in the message "ms" into the lkb's buffer and take over the
+   message's lvb sequence number.  Nothing is done when the lkb has no lvb
+   buffer or was not requested with DLM_LKF_VALBLK.
+
+   The copy length comes from the received message, so it is limited to
+   ls_lvblen, the size set_lvb_lock() uses for the same buffer; a larger
+   value in the message must not write past the end of it. */
+static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			    struct dlm_message *ms)
+{
+	int b;
+
+	if (!lkb->lkb_lvbptr)
+		return;
+
+	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+		return;
+
+	b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+	if (b == 1) {
+		int len = receive_extralen(ms);
+
+		if (len > r->res_ls->ls_lvblen)
+			len = r->res_ls->ls_lvblen;
+		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+		lkb->lkb_lvbseq = ms->m_lvbseq;
+	}
+}
+
+/* Manipulate lkb's on rsb's convert/granted/waiting queues
+ remove_lock -- used for unlock, removes lkb from granted
+ revert_lock -- used for cancel, moves lkb from convert to granted
+ grant_lock -- used for request and convert, adds lkb to granted or
+ moves lkb from convert or waiting to granted
+
+ Each of these is used for master or local copy lkb's. There is
+ also a _pc() variation used to make the corresponding change on
+ a process copy (pc) lkb. */
+
+/* Take the lkb off its state queue, reset its granted mode to
+   DLM_LOCK_IV and drop the creation reference.  unhold_lkb() asserts
+   that the caller still holds another reference. */
+static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ del_lkb(r, lkb);
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ /* this unhold undoes the original ref from create_lkb()
+ so this leads to the lkb being freed */
+ unhold_lkb(lkb);
+}
+
+/* Master/local copy variant: update the rsb's lvb from the lkb first
+   (set_lvb_unlock()), then remove the lock. */
+static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ set_lvb_unlock(r, lkb);
+ _remove_lock(r, lkb);
+}
+
+/* Process copy variant: remove the lock without touching the lvb. */
+static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ _remove_lock(r, lkb);
+}
+
+/* Undo a pending request or conversion.  The requested mode is reset to
+   DLM_LOCK_IV in every case; what else happens depends on the queue the
+   lkb is on:
+     granted  - nothing
+     convert  - moved back to the grant queue
+     waiting  - removed from the queue and the creation reference dropped
+   Any other status is only logged. */
+static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ lkb->lkb_rqmode = DLM_LOCK_IV;
+
+ switch (lkb->lkb_status) {
+ case DLM_LKSTS_GRANTED:
+ break;
+ case DLM_LKSTS_CONVERT:
+ move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+ break;
+ case DLM_LKSTS_WAITING:
+ del_lkb(r, lkb);
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ /* this unhold undoes the original ref from create_lkb()
+ so this leads to the lkb being freed */
+ unhold_lkb(lkb);
+ break;
+ default:
+ log_print("invalid status for revert %d", lkb->lkb_status);
+ }
+}
+
+/* Process copy variant; currently identical to revert_lock(). */
+static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ revert_lock(r, lkb);
+}
+
+/* Make the requested mode the granted mode.  When the two differ, the
+   lkb is placed on the grant queue: moved there if it is on a queue
+   already (lkb_status != 0), added otherwise.  The requested mode is
+   reset to DLM_LOCK_IV in every case. */
+static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ if (lkb->lkb_grmode != lkb->lkb_rqmode) {
+ lkb->lkb_grmode = lkb->lkb_rqmode;
+ if (lkb->lkb_status)
+ move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+ else
+ add_lkb(r, lkb, DLM_LKSTS_GRANTED);
+ }
+
+ lkb->lkb_rqmode = DLM_LOCK_IV;
+}
+
+/* Master/local copy variant: transfer the lvb (set_lvb_lock() must run
+   before _grant_lock() because it reads both the old granted mode and the
+   requested mode), grant the lock and reset lkb_highbast. */
+static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ set_lvb_lock(r, lkb);
+ _grant_lock(r, lkb);
+ lkb->lkb_highbast = 0;
+}
+
+/* Process copy variant: take the lvb from the received message "ms"
+   (before the modes are changed by _grant_lock()), then grant the lock. */
+static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ set_lvb_lock_pc(r, lkb, ms);
+ _grant_lock(r, lkb);
+}
+
+/* Used by grant_pending_locks().  Because the grant happens
+   asynchronously, the owner has to be told about it: a master copy (lkb
+   owned by a remote node) gets a grant message via send_grant(), any
+   other lkb gets a completion ast with status 0 queued. */
+
+static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	grant_lock(r, lkb);
+
+	if (!is_master_copy(lkb)) {
+		queue_cast(r, lkb, 0);
+		return;
+	}
+
+	send_grant(r, lkb);
+}
+
+/* Return 1 when the lkb has the same lock id as the first entry of the
+   state queue "head", 0 otherwise.  The queue must not be empty: the
+   first entry is taken from head->next without an emptiness check. */
+static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
+{
+	struct dlm_lkb *first;
+
+	first = list_entry(head->next, struct dlm_lkb, lkb_statequeue);
+
+	return lkb->lkb_id == first->lkb_id;
+}
+
+/* Check if the given lkb conflicts with another lkb on the queue. */
+
+static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
+{
+	struct dlm_lkb *other;
+
+	/* lkb itself may be on the queue; it never conflicts with itself */
+	list_for_each_entry(other, head, lkb_statequeue) {
+		if (other != lkb && !modes_compat(other, lkb))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * "A conversion deadlock arises with a pair of lock requests in the converting
+ * queue for one resource. The granted mode of each lock blocks the requested
+ * mode of the other lock."
+ *
+ * Part 2: if the granted mode of lkb is preventing the first lkb in the
+ * convert queue from being granted, then demote lkb (set grmode to NL).
+ * This second form requires that we check for conv-deadlk even when
+ * now == 0 in _can_be_granted().
+ *
+ * Example:
+ * Granted Queue: empty
+ * Convert Queue: NL->EX (first lock)
+ * PR->EX (second lock)
+ *
+ * The first lock can't be granted because of the granted mode of the second
+ * lock and the second lock can't be granted because it's not first in the
+ * list. We demote the granted mode of the second lock (the lkb passed to this
+ * function).
+ *
+ * After the resolution, the "grant pending" function needs to go back and try
+ * to grant locks on the convert queue again since the first lock can now be
+ * granted.
+ */
+
+static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
+{
+	struct dlm_lkb *cur;
+	struct dlm_lkb *head_lkb = NULL;	/* first entry on the convert queue */
+	struct dlm_lkb *found = NULL;		/* lkb, if it is on the convert queue */
+
+	list_for_each_entry(cur, &rsb->res_convertqueue, lkb_statequeue) {
+		if (!head_lkb)
+			head_lkb = cur;
+
+		if (cur == lkb) {
+			found = lkb;
+			continue;
+		}
+
+		/* mutual blocking between lkb and another converting lock */
+		if (!modes_compat(cur, lkb) && !modes_compat(lkb, cur))
+			return 1;
+	}
+
+	/* if lkb is on the convert queue and is preventing the first
+	   from being granted, then there's deadlock and we demote lkb.
+	   multiple converting locks may need to do this before the first
+	   converting lock can be granted. */
+
+	if (!found || found == head_lkb)
+		return 0;
+
+	if (modes_compat(lkb, head_lkb))
+		return 0;
+
+	return !queue_conflict(&rsb->res_grantqueue, head_lkb);
+}
+
+/*
+ * Return 1 if the lock can be granted, 0 otherwise.
+ * Also detect and resolve conversion deadlocks.
+ *
+ * lkb is the lock to be granted
+ *
+ * now is 1 if the function is being called in the context of the
+ * immediate request, it is 0 if called later, after the lock has been
+ * queued.
+ *
+ * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
+ */
+
+static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+ int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
+
+ /*
+ * conv is nonzero when lkb already has a granted mode, i.e. the
+ * request being evaluated is a conversion rather than a new request.
+ *
+ * 6-10: Version 5.4 introduced an option to address the phenomenon of
+ * a new request for a NL mode lock being blocked.
+ *
+ * 6-11: If the optional EXPEDITE flag is used with the new NL mode
+ * request, then it would be granted. In essence, the use of this flag
+ * tells the Lock Manager to expedite this request by not considering
+ * what may be in the CONVERTING or WAITING queues... As of this
+ * writing, the EXPEDITE flag can be used only with new requests for NL
+ * mode locks. This flag is not valid for conversion requests.
+ *
+ * A shortcut. Earlier checks return an error if EXPEDITE is used in a
+ * conversion or used with a non-NL requested mode. We also know an
+ * EXPEDITE request is always granted immediately, so now must always
+ * be 1. The full condition to grant an expedite request: (now &&
+ * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
+ * therefore be shortened to just checking the flag.
+ */
+
+ if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
+ return 1;
+
+ /*
+ * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
+ * added to the remaining conditions.
+ */
+
+ if (queue_conflict(&r->res_grantqueue, lkb))
+ goto out;
+
+ /*
+ * 6-3: By default, a conversion request is immediately granted if the
+ * requested mode is compatible with the modes of all other granted
+ * locks
+ *
+ * The check below also refuses the grant when lkb conflicts with a
+ * lock that is on the convert queue.
+ */
+
+ if (queue_conflict(&r->res_convertqueue, lkb))
+ goto out;
+
+ /*
+ * 6-5: But the default algorithm for deciding whether to grant or
+ * queue conversion requests does not by itself guarantee that such
+ * requests are serviced on a "first come first serve" basis. This, in
+ * turn, can lead to a phenomenon known as "indefinite postponement".
+ *
+ * 6-7: This issue is dealt with by using the optional QUECVT flag with
+ * the system service employed to request a lock conversion. This flag
+ * forces certain conversion requests to be queued, even if they are
+ * compatible with the granted modes of other locks on the same
+ * resource. Thus, the use of this flag results in conversion requests
+ * being ordered on a "first come first serve" basis.
+ *
+ * DCT: This condition is all about new conversions being able to occur
+ * "in place" while the lock remains on the granted queue (assuming
+ * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
+ * doesn't _have_ to go onto the convert queue where it's processed in
+ * order. The "now" variable is necessary to distinguish converts
+ * being received and processed for the first time now, because once a
+ * convert is moved to the conversion queue the condition below applies
+ * requiring fifo granting.
+ */
+
+ if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
+ return 1;
+
+ /*
+ * The NOORDER flag is set to avoid the standard vms rules on grant
+ * order.
+ */
+
+ if (lkb->lkb_exflags & DLM_LKF_NOORDER)
+ return 1;
+
+ /*
+ * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
+ * granted until all other conversion requests ahead of it are granted
+ * and/or canceled.
+ */
+
+ if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
+ return 1;
+
+ /*
+ * 6-4: By default, a new request is immediately granted only if all
+ * three of the following conditions are satisfied when the request is
+ * issued:
+ * - The queue of ungranted conversion requests for the resource is
+ * empty.
+ * - The queue of ungranted new requests for the resource is empty.
+ * - The mode of the new request is compatible with the most
+ * restrictive mode of all granted locks on the resource.
+ */
+
+ if (now && !conv && list_empty(&r->res_convertqueue) &&
+ list_empty(&r->res_waitqueue))
+ return 1;
+
+ /*
+ * 6-4: Once a lock request is in the queue of ungranted new requests,
+ * it cannot be granted until the queue of ungranted conversion
+ * requests is empty, all ungranted new requests ahead of it are
+ * granted and/or canceled, and it is compatible with the granted mode
+ * of the most restrictive lock granted on the resource.
+ */
+
+ if (!now && !conv && list_empty(&r->res_convertqueue) &&
+ first_in_list(lkb, &r->res_waitqueue))
+ return 1;
+
+ out:
+ /*
+ * The following, enabled by CONVDEADLK, departs from VMS.
+ *
+ * This point is reached whenever the lock cannot be granted. If lkb
+ * is a conversion with CONVDEADLK set and a conversion deadlock is
+ * detected, its granted mode is demoted to NL and DLM_SBF_DEMOTED is
+ * recorded in lkb_sbflags. The function still returns 0.
+ */
+
+ if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
+ conversion_deadlock_detect(r, lkb)) {
+ lkb->lkb_grmode = DLM_LOCK_NL;
+ lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
+ }
+
+ return 0;
+}
+
+/*
+ * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
+ * simple way to provide a big optimization to applications that can use them.
+ */
+
+static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+	int8_t saved_rqmode = lkb->lkb_rqmode;
+	uint32_t exflags = lkb->lkb_exflags;
+	int8_t alt_mode = 0;
+	int granted;
+
+	granted = _can_be_granted(r, lkb, now);
+
+	/* grantable as requested, or the attempt demoted the lkb: no
+	   alternate mode is tried */
+	if (granted || (lkb->lkb_sbflags & DLM_SBF_DEMOTED))
+		return granted;
+
+	/* pick the alternate mode, ALTPR taking precedence over ALTCW */
+	if ((exflags & DLM_LKF_ALTPR) && saved_rqmode != DLM_LOCK_PR)
+		alt_mode = DLM_LOCK_PR;
+	else if ((exflags & DLM_LKF_ALTCW) && saved_rqmode != DLM_LOCK_CW)
+		alt_mode = DLM_LOCK_CW;
+
+	if (!alt_mode)
+		return granted;
+
+	/* retry with the alternate mode; restore the original requested
+	   mode if that does not work either */
+	lkb->lkb_rqmode = alt_mode;
+	granted = _can_be_granted(r, lkb, now);
+	if (granted)
+		lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
+	else
+		lkb->lkb_rqmode = saved_rqmode;
+
+	return granted;
+}
+
+/* Grant whatever can be granted on the convert queue.  The queue is
+   rescanned from the start after any pass that granted a lock, and at
+   most once more after a pass in which a lock became demoted.  Returns
+   the larger of high and the highest requested mode left ungranted in
+   the final pass. */
+static int grant_pending_convert(struct dlm_rsb *r, int high)
+{
+	struct dlm_lkb *lkb, *next;
+	int hi, was_demoted;
+	int granted_any, newly_demoted;
+	int retried = 0;
+
+	for (;;) {
+		granted_any = 0;
+		newly_demoted = 0;
+		hi = DLM_LOCK_IV;
+
+		list_for_each_entry_safe(lkb, next, &r->res_convertqueue,
+					 lkb_statequeue) {
+			was_demoted = is_demoted(lkb);
+
+			if (can_be_granted(r, lkb, 0)) {
+				grant_lock_pending(r, lkb);
+				granted_any = 1;
+				continue;
+			}
+
+			hi = max_t(int, lkb->lkb_rqmode, hi);
+			/* can_be_granted() may have demoted this lkb */
+			if (!was_demoted && is_demoted(lkb))
+				newly_demoted = 1;
+		}
+
+		if (granted_any)
+			continue;
+
+		if (newly_demoted && !retried) {
+			retried = 1;
+			continue;
+		}
+
+		break;
+	}
+
+	return max_t(int, high, hi);
+}
+
+/* Single pass over the wait queue: grant what can be granted and fold
+   the requested mode of everything else into high, which is returned. */
+static int grant_pending_wait(struct dlm_rsb *r, int high)
+{
+	struct dlm_lkb *lkb, *next;
+
+	list_for_each_entry_safe(lkb, next, &r->res_waitqueue, lkb_statequeue) {
+		if (!can_be_granted(r, lkb, 0)) {
+			high = max_t(int, lkb->lkb_rqmode, high);
+			continue;
+		}
+		grant_lock_pending(r, lkb);
+	}
+
+	return high;
+}
+
+/* Process the convert queue, then the wait queue, and queue blocking
+   ASTs to granted locks that stand in the way of what is still pending. */
+static void grant_pending_locks(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb, *next;
+	int high;
+
+	DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
+
+	/* conversions are processed before new requests */
+	high = grant_pending_wait(r, grant_pending_convert(r, DLM_LOCK_IV));
+	if (high == DLM_LOCK_IV)
+		return;
+
+	/*
+	 * If there are locks left on the wait/convert queue then send blocking
+	 * ASTs to granted locks based on the largest requested mode (high)
+	 * found above. FIXME: highbast < high comparison not valid for PR/CW.
+	 */
+
+	list_for_each_entry_safe(lkb, next, &r->res_grantqueue, lkb_statequeue) {
+		if (!lkb->lkb_bastaddr)
+			continue;
+		if (lkb->lkb_highbast >= high)
+			continue;
+		if (__dlm_compat_matrix[lkb->lkb_grmode+1][high+1])
+			continue;
+
+		queue_bast(r, lkb, high);
+		lkb->lkb_highbast = high;
+	}
+}
+
+/* For each lock on the given queue that has a bast callback, has not yet
+   had a bast for a mode as high as lkb's requested mode, and is
+   incompatible with lkb: queue a bast and remember the mode in
+   lkb_highbast. */
+static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
+			    struct dlm_lkb *lkb)
+{
+	struct dlm_lkb *holder;
+
+	list_for_each_entry(holder, head, lkb_statequeue) {
+		if (!holder->lkb_bastaddr)
+			continue;
+		if (holder->lkb_highbast >= lkb->lkb_rqmode)
+			continue;
+		if (modes_compat(holder, lkb))
+			continue;
+
+		queue_bast(r, holder, lkb->lkb_rqmode);
+		holder->lkb_highbast = lkb->lkb_rqmode;
+	}
+}
+
+/* Queue blocking ASTs for lkb against the grant queue only. */
+static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct list_head *granted = &r->res_grantqueue;
+
+	send_bast_queue(r, granted, lkb);
+}
+
+/* Queue blocking ASTs for lkb against the grant queue and then the
+   convert queue. */
+static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct list_head *granted = &r->res_grantqueue;
+	struct list_head *converting = &r->res_convertqueue;
+
+	send_bast_queue(r, granted, lkb);
+	send_bast_queue(r, converting, lkb);
+}
+
+/* set_master(r, lkb) -- set the master nodeid of a resource
+
+ The purpose of this function is to set the nodeid field in the given
+ lkb using the nodeid field in the given rsb. If the rsb's nodeid is
+ known, it can just be copied to the lkb and the function will return
+ 0. If the rsb's nodeid is _not_ known, it needs to be looked up
+ before it can be copied to the lkb.
+
+ When the rsb nodeid is being looked up remotely, the initial lkb
+ causing the lookup is kept on the ls_waiters list waiting for the
+ lookup reply. Other lkb's waiting for the same rsb lookup are kept
+ on the rsb's res_lookup list until the master is verified.
+
+ Return values:
+ 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
+ 1: the rsb master is not available and the lkb has been placed on
+ a wait queue
+
+ Order of the checks as implemented below:
+ - RSB_MASTER_UNCERTAIN is set: the flag is cleared, lkb becomes
+ res_first_lkid and the rsb's current nodeid is used (returns 0).
+ - another lkb is already res_first_lkid: lkb is added to res_lookup
+ (returns 1).
+ - res_nodeid is 0 or positive: it is copied to the lkb (returns 0).
+ - otherwise res_nodeid is asserted to be -1. If the directory node is
+ not this node, lkb becomes res_first_lkid and a lookup is sent
+ (returns 1; the return value of send_lookup() is not checked). If
+ the directory node is this node, the master is looked up locally.
+*/
+
+static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ struct dlm_ls *ls = r->res_ls;
+ int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+
+ if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
+ rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+ r->res_first_lkid = lkb->lkb_id;
+ lkb->lkb_nodeid = r->res_nodeid;
+ return 0;
+ }
+
+ if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
+ list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
+ return 1;
+ }
+
+ if (r->res_nodeid == 0) {
+ lkb->lkb_nodeid = 0;
+ return 0;
+ }
+
+ if (r->res_nodeid > 0) {
+ lkb->lkb_nodeid = r->res_nodeid;
+ return 0;
+ }
+
+ DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r););
+
+ dir_nodeid = dlm_dir_nodeid(r);
+
+ if (dir_nodeid != our_nodeid) {
+ r->res_first_lkid = lkb->lkb_id;
+ send_lookup(r, lkb);
+ return 1;
+ }
+
+ for (;;) {
+ /* It's possible for dlm_scand to remove an old rsb for
+ this same resource from the toss list, us to create
+ a new one, look up the master locally, and find it
+ already exists just before dlm_scand does the
+ dir_remove() on the previous rsb.
+
+ The lookup is therefore retried until it succeeds, with
+ schedule() called between attempts. */
+
+ error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
+ r->res_length, &ret_nodeid);
+ if (!error)
+ break;
+ log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
+ schedule();
+ }
+
+ if (ret_nodeid == our_nodeid) {
+ r->res_first_lkid = 0;
+ r->res_nodeid = 0;
+ lkb->lkb_nodeid = 0;
+ } else {
+ r->res_first_lkid = lkb->lkb_id;
+ r->res_nodeid = ret_nodeid;
+ lkb->lkb_nodeid = ret_nodeid;
+ }
+ return 0;
+}
+
+/* Drain res_lookup: each lkb is taken off the list and passed to
+   _request_lock(), with a schedule() call after every entry. */
+static void process_lookup_list(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb, *next;
+
+	list_for_each_entry_safe(lkb, next, &r->res_lookup, lkb_rsb_lookup) {
+		list_del(&lkb->lkb_rsb_lookup);
+		_request_lock(r, lkb);
+		schedule();
+	}
+}
+
+/* confirm_master -- confirm (or deny) an rsb's master nodeid */
+
+static void confirm_master(struct dlm_rsb *r, int error)
+{
+	struct dlm_lkb *lkb;
+
+	/* nothing is waiting on confirmation */
+	if (!r->res_first_lkid)
+		return;
+
+	switch (error) {
+	case 0:
+	case -EINPROGRESS:
+		/* confirmed: release the lkbs parked on res_lookup */
+		r->res_first_lkid = 0;
+		process_lookup_list(r);
+		break;
+
+	case -EAGAIN:
+		/* the remote master didn't queue our NOQUEUE request;
+		   make a waiting lkb the first_lkid */
+
+		if (list_empty(&r->res_lookup)) {
+			r->res_first_lkid = 0;
+			r->res_nodeid = -1;
+			break;
+		}
+
+		lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
+				 lkb_rsb_lookup);
+		list_del(&lkb->lkb_rsb_lookup);
+		r->res_first_lkid = lkb->lkb_id;
+		_request_lock(r, lkb);
+		break;
+
+	default:
+		log_error(r->res_ls, "confirm_master unknown error %d", error);
+	}
+}
+
+/* Validate the caller-supplied arguments of a lock request/conversion and
+   stash them in args.  Returns 0 on success and -EINVAL for any invalid
+   combination; args is only written on success.
+
+   namelen is only checked for new requests (CONVERT not set).  It must
+   lie in 0..DLM_RESNAME_MAXLEN: dlm_lock() receives the length as an
+   unsigned int, so an oversized value can arrive here as a negative int
+   and would otherwise slip past the upper-bound check. */
+static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
+			 int namelen, uint32_t parent_lkid, void *ast,
+			 void *astarg, void *bast, struct dlm_args *args)
+{
+	int rv = -EINVAL;
+
+	/* check for invalid arg usage */
+
+	if (mode < 0 || mode > DLM_LOCK_EX)
+		goto out;
+
+	if (!(flags & DLM_LKF_CONVERT) &&
+	    (namelen < 0 || namelen > DLM_RESNAME_MAXLEN))
+		goto out;
+
+	/* CANCEL belongs to the unlock path */
+	if (flags & DLM_LKF_CANCEL)
+		goto out;
+
+	/* QUECVT and CONVDEADLK only make sense on conversions */
+	if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
+		goto out;
+
+	if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
+		goto out;
+
+	if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
+		goto out;
+
+	/* EXPEDITE is limited to new NL requests that may be queued */
+	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
+		goto out;
+
+	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
+		goto out;
+
+	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
+		goto out;
+
+	if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
+		goto out;
+
+	if (!ast || !lksb)
+		goto out;
+
+	if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
+		goto out;
+
+	/* parent/child locks not yet supported */
+	if (parent_lkid)
+		goto out;
+
+	if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
+		goto out;
+
+	/* these args will be copied to the lkb in validate_lock_args,
+	   it cannot be done now because when converting locks, fields in
+	   an active lkb cannot be modified before locking the rsb */
+
+	args->flags = flags;
+	args->astaddr = ast;
+	args->astparam = (long) astarg;
+	args->bastaddr = bast;
+	args->mode = mode;
+	args->lksb = lksb;
+	rv = 0;
+ out:
+	return rv;
+}
+
+/* Validate the flags of an unlock/cancel call and record flags and
+   astarg in args.  Any flag outside the accepted set yields -EINVAL. */
+static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
+{
+	const uint32_t accepted = DLM_LKF_CANCEL | DLM_LKF_VALBLK |
+				  DLM_LKF_IVVALBLK | DLM_LKF_FORCEUNLOCK;
+
+	if (flags & ~accepted)
+		return -EINVAL;
+
+	args->astparam = (long) astarg;
+	args->flags = flags;
+	return 0;
+}
+
+/* Check that the lkb is in a state that allows the request described by
+   args, then copy the args into the lkb.  Returns 0, -EINVAL or -EBUSY;
+   the lkb is left unmodified on error.  The extra checks apply to
+   conversions only. */
+static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			      struct dlm_args *args)
+{
+	if (args->flags & DLM_LKF_CONVERT) {
+		if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+			return -EINVAL;
+
+		if (args->flags & DLM_LKF_QUECVT &&
+		    !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+			return -EINVAL;
+
+		/* only a granted lkb can be converted ... */
+		if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+			return -EBUSY;
+
+		/* ... and only when lkb_wait_type is clear */
+		if (lkb->lkb_wait_type)
+			return -EBUSY;
+	}
+
+	lkb->lkb_exflags = args->flags;
+	lkb->lkb_sbflags = 0;
+	lkb->lkb_astaddr = args->astaddr;
+	lkb->lkb_astparam = args->astparam;
+	lkb->lkb_bastaddr = args->bastaddr;
+	lkb->lkb_rqmode = args->mode;
+	lkb->lkb_lksb = args->lksb;
+	lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
+	lkb->lkb_ownpid = (int) current->pid;
+
+	return 0;
+}
+
+/* Check that the lkb may be unlocked or cancelled as described by args,
+   then copy flags and astparam into it.  Returns 0, -EINVAL or -EBUSY.
+   FORCEUNLOCK skips the status and wait_type checks, but not the
+   master-copy check. */
+static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
+{
+	if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+		return -EINVAL;
+
+	if (!(args->flags & DLM_LKF_FORCEUNLOCK)) {
+		int cancel = !!(args->flags & DLM_LKF_CANCEL);
+		int granted = (lkb->lkb_status == DLM_LKSTS_GRANTED);
+
+		/* cancel requires a lock that is not granted, a plain
+		   unlock requires one that is */
+		if (cancel == granted)
+			return -EINVAL;
+
+		if (lkb->lkb_wait_type)
+			return -EBUSY;
+	}
+
+	lkb->lkb_exflags = args->flags;
+	lkb->lkb_sbflags = 0;
+	lkb->lkb_astparam = args->astparam;
+
+	return 0;
+}
+
+/*
+ * Four stage 4 varieties:
+ * do_request(), do_convert(), do_unlock(), do_cancel()
+ * These are called on the master node for the given lock and
+ * from the central locking logic.
+ */
+
+/* Returns 0 if the lock was granted, -EINPROGRESS if it was put on the
+   wait queue, -EAGAIN if it could be neither granted nor queued. */
+static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	if (can_be_granted(r, lkb, 1)) {
+		grant_lock(r, lkb);
+		queue_cast(r, lkb, 0);
+		return 0;
+	}
+
+	if (can_be_queued(lkb)) {
+		add_lkb(r, lkb, DLM_LKSTS_WAITING);
+		send_blocking_asts(r, lkb);
+		return -EINPROGRESS;
+	}
+
+	if (force_blocking_asts(lkb))
+		send_blocking_asts_all(r, lkb);
+	queue_cast(r, lkb, -EAGAIN);
+
+	return -EAGAIN;
+}
+
+/* Returns 0 if the conversion was granted, -EINPROGRESS if the lkb was
+   moved to the convert queue, -EAGAIN if it could be neither granted nor
+   queued. */
+static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	/* changing an existing lock may allow others to be granted */
+
+	if (can_be_granted(r, lkb, 1)) {
+		grant_lock(r, lkb);
+		queue_cast(r, lkb, 0);
+		grant_pending_locks(r);
+		return 0;
+	}
+
+	if (can_be_queued(lkb)) {
+		/* the grant attempt above may have demoted lkb */
+		if (is_demoted(lkb))
+			grant_pending_locks(r);
+		del_lkb(r, lkb);
+		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+		send_blocking_asts(r, lkb);
+		return -EINPROGRESS;
+	}
+
+	if (force_blocking_asts(lkb))
+		send_blocking_asts_all(r, lkb);
+	queue_cast(r, lkb, -EAGAIN);
+
+	return -EAGAIN;
+}
+
+/* Remove the lock, queue_cast() the -DLM_EUNLOCK result and give pending
+   locks a chance to be granted.  Always returns -DLM_EUNLOCK. */
+static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	const int result = -DLM_EUNLOCK;
+
+	remove_lock(r, lkb);
+	queue_cast(r, lkb, result);
+	grant_pending_locks(r);
+
+	return result;
+}
+
+/* FIXME: if revert_lock() finds that the lkb is granted, we should
+ skip the queue_cast(ECANCEL). It indicates that the request/convert
+ completed (and queued a normal ast) just before the cancel; we don't
+ want to clobber the sb_result for the normal ast with ECANCEL. */
+
+static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	const int result = -DLM_ECANCEL;
+
+	revert_lock(r, lkb);
+	queue_cast(r, lkb, result);
+	grant_pending_locks(r);
+
+	/* always reported as cancelled, see the FIXME above */
+	return result;
+}
+
+/*
+ * Four stage 3 varieties:
+ * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
+ */
+
+/* add a new lkb to a possibly new rsb, called by requesting process */
+
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int rv;
+
+	/* set_master: sets lkb nodeid from r */
+
+	rv = set_master(r, lkb);
+	if (rv < 0)
+		return rv;
+
+	/* positive: the master is not known yet and set_master() has
+	   parked the lkb; not an error */
+	if (rv)
+		return 0;
+
+	/* receive_request() calls do_request() on remote node */
+	return is_remote(r) ? send_request(r, lkb) : do_request(r, lkb);
+}
+
+/* change some property of an existing lkb, e.g. mode */
+
+static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	/* receive_convert() calls do_convert() on remote node */
+	return is_remote(r) ? send_convert(r, lkb) : do_convert(r, lkb);
+}
+
+/* remove an existing lkb from the granted queue */
+
+static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	/* receive_unlock() calls do_unlock() on remote node */
+	return is_remote(r) ? send_unlock(r, lkb) : do_unlock(r, lkb);
+}
+
+/* remove an existing lkb from the convert or wait queue */
+
+static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	/* receive_cancel() calls do_cancel() on remote node */
+	return is_remote(r) ? send_cancel(r, lkb) : do_cancel(r, lkb);
+}
+
+/*
+ * Four stage 2 varieties:
+ * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
+ */
+
+/* New request: validate the args against the lkb, find the rsb for name
+   (find_rsb() is called with R_CREATE), attach the lkb to it and run
+   _request_lock() with the rsb locked.  The lkb id is published in the
+   caller's lksb before the request is processed. */
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
+			int len, struct dlm_args *args)
+{
+	struct dlm_rsb *r;
+	int error;
+
+	error = validate_lock_args(ls, lkb, args);
+	if (error)
+		return error;
+
+	error = find_rsb(ls, name, len, R_CREATE, &r);
+	if (error)
+		return error;
+
+	lock_rsb(r);
+
+	attach_lkb(r, lkb);
+	lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
+	error = _request_lock(r, lkb);
+
+	unlock_rsb(r);
+	put_rsb(r);
+
+	return error;
+}
+
+/* Conversion: with the lkb's rsb held and locked, validate the args and,
+   if they are acceptable, run _convert_lock(). */
+static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			struct dlm_args *args)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+	int error;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_lock_args(ls, lkb, args);
+	if (!error)
+		error = _convert_lock(r, lkb);
+
+	unlock_rsb(r);
+	put_rsb(r);
+	return error;
+}
+
+/* Unlock: with the lkb's rsb held and locked, validate the args and, if
+   they are acceptable, run _unlock_lock(). */
+static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       struct dlm_args *args)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+	int error;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_unlock_args(lkb, args);
+	if (!error)
+		error = _unlock_lock(r, lkb);
+
+	unlock_rsb(r);
+	put_rsb(r);
+	return error;
+}
+
+/* Cancel: with the lkb's rsb held and locked, validate the args and, if
+   they are acceptable, run _cancel_lock(). */
+static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       struct dlm_args *args)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+	int error;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_unlock_args(lkb, args);
+	if (!error)
+		error = _cancel_lock(r, lkb);
+
+	unlock_rsb(r);
+	put_rsb(r);
+	return error;
+}
+
+/*
+ * Two stage 1 varieties: dlm_lock() and dlm_unlock()
+ */
+
+/*
+ * dlm_lock() - request a new lock, or convert an existing one when flags
+ * contains DLM_LKF_CONVERT (the lock is then identified by lksb->sb_lkid).
+ *
+ * Returns 0 when the request was granted, queued (-EINPROGRESS from the
+ * lower layers) or refused with -EAGAIN; the latter two are reported as 0
+ * here.  Any other error is returned unchanged; -EINVAL is returned for an
+ * unknown lockspace or invalid arguments.
+ *
+ * A conversion reads lksb->sb_lkid before set_lock_args() gets the chance
+ * to reject a NULL lksb, so that case is rejected up front.
+ */
+int dlm_lock(dlm_lockspace_t *lockspace,
+	     int mode,
+	     struct dlm_lksb *lksb,
+	     uint32_t flags,
+	     void *name,
+	     unsigned int namelen,
+	     uint32_t parent_lkid,
+	     void (*ast) (void *astarg),
+	     void *astarg,
+	     void (*bast) (void *astarg, int mode))
+{
+	struct dlm_ls *ls;
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	int error, convert = flags & DLM_LKF_CONVERT;
+
+	if (convert && !lksb)
+		return -EINVAL;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	lock_recovery(ls);
+
+	if (convert)
+		error = find_lkb(ls, lksb->sb_lkid, &lkb);
+	else
+		error = create_lkb(ls, &lkb);
+
+	if (error)
+		goto out;
+
+	error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
+			      astarg, bast, &args);
+	if (error)
+		goto out_put;
+
+	if (convert)
+		error = convert_lock(ls, lkb, &args);
+	else
+		error = request_lock(ls, lkb, name, namelen, &args);
+
+	if (error == -EINPROGRESS)
+		error = 0;
+ out_put:
+	/* drop the reference taken by find_lkb() for a conversion, or the
+	   lkb of a new request that failed */
+	if (convert || error)
+		__put_lkb(ls, lkb);
+	if (error == -EAGAIN)
+		error = 0;
+ out:
+	unlock_recovery(ls);
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+/*
+ * dlm_unlock() - unlock the lock identified by lkid, or cancel its pending
+ * request when flags contains DLM_LKF_CANCEL.
+ *
+ * -DLM_EUNLOCK and -DLM_ECANCEL from the lower layers are reported as 0;
+ * other errors are returned unchanged.  The lksb argument is not used by
+ * this function.
+ */
+int dlm_unlock(dlm_lockspace_t *lockspace,
+	       uint32_t lkid,
+	       uint32_t flags,
+	       struct dlm_lksb *lksb,
+	       void *astarg)
+{
+	struct dlm_args args;
+	struct dlm_lkb *lkb;
+	struct dlm_ls *ls;
+	int error;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	error = set_unlock_args(flags, astarg, &args);
+	if (!error) {
+		error = (flags & DLM_LKF_CANCEL) ?
+			cancel_lock(ls, lkb, &args) :
+			unlock_lock(ls, lkb, &args);
+
+		if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
+			error = 0;
+	}
+
+	dlm_put_lkb(lkb);
+out:
+	unlock_recovery(ls);
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+/*
+ * send/receive routines for remote operations and replies
+ *
+ * send_args
+ * send_common
+ * send_request receive_request
+ * send_convert receive_convert
+ * send_unlock receive_unlock
+ * send_cancel receive_cancel
+ * send_grant receive_grant
+ * send_bast receive_bast
+ * send_lookup receive_lookup
+ * send_remove receive_remove
+ *
+ * send_common_reply
+ * receive_request_reply send_request_reply
+ * receive_convert_reply send_convert_reply
+ * receive_unlock_reply send_unlock_reply
+ * receive_cancel_reply send_cancel_reply
+ * receive_lookup_reply send_lookup_reply
+ */
+
+/* Obtain a lowcomms buffer for a message of type mstype addressed to
+   to_nodeid, zero it and fill in the header and m_type.  The buffer has
+   room for the resource name (REQUEST/LOOKUP/REMOVE) or, when lkb is
+   given and has an lvb, for the lvb (CONVERT/UNLOCK/REQUEST_REPLY/
+   CONVERT_REPLY/GRANT).  Returns 0 and sets *ms_ret / *mh_ret, or
+   -ENOBUFS if no buffer is available. */
+static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			  int to_nodeid, int mstype,
+			  struct dlm_message **ms_ret,
+			  struct dlm_mhandle **mh_ret)
+{
+	struct dlm_mhandle *handle;
+	struct dlm_message *msg;
+	char *buf;
+	int len = sizeof(struct dlm_message);
+
+	switch (mstype) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_LOOKUP:
+	case DLM_MSG_REMOVE:
+		len += r->res_length;
+		break;
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_GRANT:
+		if (lkb && lkb->lkb_lvbptr)
+			len += r->res_ls->ls_lvblen;
+		break;
+	default:
+		/* no extra payload */
+		break;
+	}
+
+	/* get_buffer gives us a message handle (mh) that we need to
+	   pass into lowcomms_commit and a message buffer (mb) that we
+	   write our data into */
+
+	handle = dlm_lowcomms_get_buffer(to_nodeid, len, GFP_KERNEL, &buf);
+	if (!handle)
+		return -ENOBUFS;
+
+	memset(buf, 0, len);
+	msg = (struct dlm_message *) buf;
+
+	msg->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	msg->m_header.h_lockspace = r->res_ls->ls_global_id;
+	msg->m_header.h_nodeid = dlm_our_nodeid();
+	msg->m_header.h_length = len;
+	msg->m_header.h_cmd = DLM_MSG;
+
+	msg->m_type = mstype;
+
+	*mh_ret = handle;
+	*ms_ret = msg;
+	return 0;
+}
+
+/* further lowcomms enhancements or alternate implementations may make
+ the return value from this function useful at some point */
+
+static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
+{
+	/* dlm_message_out() is applied to the message before the buffer
+	   is committed to lowcomms */
+	dlm_message_out(ms);
+	dlm_lowcomms_commit_buffer(mh);
+
+	/* always 0 for now */
+	return 0;
+}
+
+/* Fill the body of an outgoing message from the lkb and rsb.
+
+   The trailing m_extra area carries the resource name for REQUEST and
+   LOOKUP, and the lvb for the message types for which create_message()
+   reserves lvb space.  For every other type (BAST, CANCEL, UNLOCK_REPLY,
+   CANCEL_REPLY, ...) the buffer has no room behind the fixed part, so
+   nothing may be copied there even if the lkb has an lvb. */
+static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
+		      struct dlm_message *ms)
+{
+	ms->m_nodeid = lkb->lkb_nodeid;
+	ms->m_pid = lkb->lkb_ownpid;
+	ms->m_lkid = lkb->lkb_id;
+	ms->m_remid = lkb->lkb_remid;
+	ms->m_exflags = lkb->lkb_exflags;
+	ms->m_sbflags = lkb->lkb_sbflags;
+	ms->m_flags = lkb->lkb_flags;
+	ms->m_lvbseq = lkb->lkb_lvbseq;
+	ms->m_status = lkb->lkb_status;
+	ms->m_grmode = lkb->lkb_grmode;
+	ms->m_rqmode = lkb->lkb_rqmode;
+	ms->m_hash = r->res_hash;
+
+	/* m_result and m_bastmode are set from function args,
+	   not from lkb fields */
+
+	if (lkb->lkb_bastaddr)
+		ms->m_asts |= AST_BAST;
+	if (lkb->lkb_astaddr)
+		ms->m_asts |= AST_COMP;
+
+	/* keep in sync with the switch in create_message(); send_remove()
+	   does not use send_args() */
+
+	switch (ms->m_type) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_LOOKUP:
+		memcpy(ms->m_extra, r->res_name, r->res_length);
+		break;
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_GRANT:
+		if (!lkb->lkb_lvbptr)
+			break;
+		memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+		break;
+	default:
+		/* no m_extra space was reserved for this message type */
+		break;
+	}
+}
+
+/* Send a message of type mstype about lkb to the rsb's master node.  The
+   lkb is put on the waiters list before sending and taken off again if
+   the message could not be created or sent.  Returns 0 or the error. */
+static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
+{
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	add_to_waiters(lkb, mstype);
+
+	error = create_message(r, lkb, r->res_nodeid, mstype, &ms, &mh);
+	if (!error) {
+		send_args(r, lkb, ms);
+		error = send_message(mh, ms);
+	}
+
+	if (error)
+		remove_from_waiters(lkb);
+
+	return error;
+}
+
+/* Send a DLM_MSG_REQUEST for lkb via send_common(). */
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	const int mstype = DLM_MSG_REQUEST;
+
+	return send_common(r, lkb, mstype);
+}
+
+/* Send a DLM_MSG_CONVERT for lkb.  For a down conversion no reply is
+   awaited: the lkb is taken off the waiters list and a successful reply
+   is faked locally using the lockspace's stub message. */
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_ls *ls = r->res_ls;
+	int error;
+
+	error = send_common(r, lkb, DLM_MSG_CONVERT);
+	if (error || !down_conversion(lkb))
+		return error;
+
+	/* down conversions go without a reply from the master */
+	remove_from_waiters(lkb);
+	ls->ls_stub_ms.m_result = 0;
+	ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+	__receive_convert_reply(r, lkb, &ls->ls_stub_ms);
+
+	return 0;
+}
+
+/* FIXME: if this lkb is the only lock we hold on the rsb, then set
+ MASTER_UNCERTAIN to force the next request on the rsb to confirm
+ that the master is still correct. */
+
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	const int mstype = DLM_MSG_UNLOCK;
+
+	return send_common(r, lkb, mstype);
+}
+
+/* Send a DLM_MSG_CANCEL for lkb via send_common(). */
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	const int mstype = DLM_MSG_CANCEL;
+
+	return send_common(r, lkb, mstype);
+}
+
+/* Send a DLM_MSG_GRANT with result 0 to the node recorded in
+   lkb_nodeid.  The lkb is not put on the waiters list. */
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	error = create_message(r, lkb, lkb->lkb_nodeid, DLM_MSG_GRANT,
+			       &ms, &mh);
+	if (error)
+		return error;
+
+	send_args(r, lkb, ms);
+	ms->m_result = 0;
+
+	return send_message(mh, ms);
+}
+
+/* Send a DLM_MSG_BAST carrying the blocked mode to the node recorded in
+   lkb_nodeid.  create_message() is called without an lkb, so no lvb
+   space is reserved in the message. */
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
+{
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	error = create_message(r, NULL, lkb->lkb_nodeid, DLM_MSG_BAST,
+			       &ms, &mh);
+	if (error)
+		return error;
+
+	send_args(r, lkb, ms);
+	ms->m_bastmode = mode;
+
+	return send_message(mh, ms);
+}
+
+/* Send a DLM_MSG_LOOKUP for the rsb to its directory node.  The lkb is
+   put on the waiters list before sending and taken off again if the
+   message could not be created or sent. */
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int dir_nodeid, error;
+
+	add_to_waiters(lkb, DLM_MSG_LOOKUP);
+
+	dir_nodeid = dlm_dir_nodeid(r);
+
+	error = create_message(r, NULL, dir_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
+	if (!error) {
+		send_args(r, lkb, ms);
+		error = send_message(mh, ms);
+	}
+
+	if (error)
+		remove_from_waiters(lkb);
+
+	return error;
+}
+
+/* Send a DLM_MSG_REMOVE naming the rsb to its directory node.  Only the
+   resource name and hash are filled in; send_args() is not used. */
+static int send_remove(struct dlm_rsb *r)
+{
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	error = create_message(r, NULL, dlm_dir_nodeid(r), DLM_MSG_REMOVE,
+			       &ms, &mh);
+	if (error)
+		return error;
+
+	memcpy(ms->m_extra, r->res_name, r->res_length);
+	ms->m_hash = r->res_hash;
+
+	return send_message(mh, ms);
+}
+
+/* Send a reply of type mstype with result rv to the node recorded in
+   lkb_nodeid. */
+static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			     int mstype, int rv)
+{
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int error;
+
+	error = create_message(r, lkb, lkb->lkb_nodeid, mstype, &ms, &mh);
+	if (error)
+		return error;
+
+	send_args(r, lkb, ms);
+	ms->m_result = rv;
+
+	return send_message(mh, ms);
+}
+
+/* Reply to a request with result rv (DLM_MSG_REQUEST_REPLY). */
+static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	const int mstype = DLM_MSG_REQUEST_REPLY;
+
+	return send_common_reply(r, lkb, mstype, rv);
+}
+
+/* Reply to a conversion with result rv (DLM_MSG_CONVERT_REPLY). */
+static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	const int mstype = DLM_MSG_CONVERT_REPLY;
+
+	return send_common_reply(r, lkb, mstype, rv);
+}
+
+/* Reply to an unlock with result rv (DLM_MSG_UNLOCK_REPLY). */
+static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	const int mstype = DLM_MSG_UNLOCK_REPLY;
+
+	return send_common_reply(r, lkb, mstype, rv);
+}
+
+/* Reply to a cancel with result rv (DLM_MSG_CANCEL_REPLY). */
+static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	const int mstype = DLM_MSG_CANCEL_REPLY;
+
+	return send_common_reply(r, lkb, mstype, rv);
+}
+
+/* Answer a lookup message: the reply goes back to the node that sent
+   ms_in and echoes its m_lkid, with rv as the result and ret_nodeid in
+   m_nodeid.  The lockspace's stub rsb is used to build the message. */
+static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
+			     int ret_nodeid, int rv)
+{
+	struct dlm_rsb *stub = &ls->ls_stub_rsb;
+	struct dlm_mhandle *mh;
+	struct dlm_message *ms;
+	int to_nodeid = ms_in->m_header.h_nodeid;
+	int error;
+
+	error = create_message(stub, NULL, to_nodeid, DLM_MSG_LOOKUP_REPLY,
+			       &ms, &mh);
+	if (error)
+		return error;
+
+	ms->m_lkid = ms_in->m_lkid;
+	ms->m_result = rv;
+	ms->m_nodeid = ret_nodeid;
+
+	return send_message(mh, ms);
+}
+
+/* which args we save from a received message depends heavily on the type
+ of message, unlike the send side where we can safely send everything about
+ the lkb for any type of message */
+
+/* Take exflags from the message, and merge lkb_flags: the upper 16 bits
+   are kept from the lkb, the lower 16 bits come from the message. */
+static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	uint32_t kept = lkb->lkb_flags & 0xFFFF0000;
+	uint32_t received = ms->m_flags & 0x0000FFFF;
+
+	lkb->lkb_exflags = ms->m_exflags;
+	lkb->lkb_flags = kept | received;
+}
+
+/* Reply variant of receive_flags(): sbflags (not exflags) are taken from
+   the message; lkb_flags is merged the same way, upper 16 bits from the
+   lkb and lower 16 bits from the message. */
+static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	uint32_t kept = lkb->lkb_flags & 0xFFFF0000;
+	uint32_t received = ms->m_flags & 0x0000FFFF;
+
+	lkb->lkb_sbflags = ms->m_sbflags;
+	lkb->lkb_flags = kept | received;
+}
+
+/* Number of bytes following the fixed part of the message, derived from
+   the length recorded in the message header. */
+static int receive_extralen(struct dlm_message *ms)
+{
+	int extra = ms->m_header.h_length - sizeof(struct dlm_message);
+
+	return extra;
+}
+
+/* Copy the lvb carried in the extra area of a message into the lkb,
+   allocating the lkb's lvb buffer first if it has none.  Nothing is done
+   unless DLM_LKF_VALBLK is set in lkb_exflags.  Returns 0, or -ENOMEM if the
+   buffer can't be allocated. */
+
+static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       struct dlm_message *ms)
+{
+	int len;
+
+	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+		return 0;
+
+	if (!lkb->lkb_lvbptr)
+		lkb->lkb_lvbptr = allocate_lvb(ls);
+	if (!lkb->lkb_lvbptr)
+		return -ENOMEM;
+
+	/* The length is derived from the header of a received message, so it
+	   must not be trusted: never copy more than the lvb buffer holds
+	   (NOTE(review): assumes allocate_lvb() sizes the buffer from
+	   ls->ls_lvblen -- confirm), and skip the copy if the header length
+	   was shorter than the fixed message, which makes len negative. */
+	len = receive_extralen(ms);
+	if (len > ls->ls_lvblen)
+		len = ls->ls_lvblen;
+	if (len > 0)
+		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+
+	return 0;
+}
+
+/* Fill in a newly created master-copy lkb from a request message: the
+   requesting node, its pid and lkid, the requested mode and which asts it
+   wants.  Returns 0, or -ENOMEM if an lvb buffer is wanted but can't be
+   allocated. */
+
+static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				struct dlm_message *ms)
+{
+	lkb->lkb_nodeid = ms->m_header.h_nodeid;
+	lkb->lkb_ownpid = ms->m_pid;
+	lkb->lkb_remid = ms->m_lkid;
+	lkb->lkb_grmode = DLM_LOCK_IV;
+	lkb->lkb_rqmode = ms->m_rqmode;
+	/* only whether an ast/bast is wanted is recorded, not a real address */
+	lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
+	lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
+
+	DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
+
+	/* The extra area of a request holds the resource name (it is what
+	   receive_request() passes to find_rsb()), not lvb data, so only
+	   make sure the lvb buffer exists; copying m_extra here would put
+	   the name into the lvb and could overrun the buffer. */
+	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+		if (!lkb->lkb_lvbptr)
+			lkb->lkb_lvbptr = allocate_lvb(ls);
+		if (!lkb->lkb_lvbptr)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Check a convert message against the lkb it targets and take the new
+   rqmode, lvbseq and lvb from it.  Returns -EINVAL if the message didn't come
+   from the lkb's node or the lkb isn't a master copy, -EBUSY if the lkb isn't
+   granted, -ENOMEM if the lvb can't be received, otherwise 0. */
+
+static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				struct dlm_message *ms)
+{
+	int error = 0;
+
+	if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
+		log_error(ls, "convert_args nodeid %d %d lkid %x %x",
+			  lkb->lkb_nodeid, ms->m_header.h_nodeid,
+			  lkb->lkb_id, lkb->lkb_remid);
+		error = -EINVAL;
+	} else if (!is_master_copy(lkb)) {
+		error = -EINVAL;
+	} else if (lkb->lkb_status != DLM_LKSTS_GRANTED) {
+		error = -EBUSY;
+	} else if (receive_lvb(ls, lkb, ms)) {
+		error = -ENOMEM;
+	} else {
+		lkb->lkb_rqmode = ms->m_rqmode;
+		lkb->lkb_lvbseq = ms->m_lvbseq;
+	}
+
+	return error;
+}
+
+/* Check an unlock message against the lkb it targets and receive its lvb.
+   Returns -EINVAL if the lkb isn't a master copy, -ENOMEM if the lvb can't be
+   received, otherwise 0. */
+
+static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			       struct dlm_message *ms)
+{
+	int error = 0;
+
+	if (!is_master_copy(lkb))
+		error = -EINVAL;
+	else if (receive_lvb(ls, lkb, ms))
+		error = -ENOMEM;
+
+	return error;
+}
+
+/* The lockspace's stub lkb is used to reply when there is no real lkb.  It
+   is given the two fields that send_xxxx_reply() needs to send a reply and
+   that the remote end needs to process it: the sender's nodeid and the
+   sender's lkid. */
+
+static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	ls->ls_stub_lkb.lkb_nodeid = ms->m_header.h_nodeid;
+	ls->ls_stub_lkb.lkb_remid = ms->m_lkid;
+}
+
+/* Handle a DLM_MSG_REQUEST.  A new lkb is created, flagged as a master copy
+   and filled in from the message, then attached to the rsb named by the
+   message's extra data.  do_request() runs under the rsb lock and its result
+   is sent back in a request reply.  If the lkb or the rsb can't be set up,
+   the error is sent back through the lockspace's stub rsb/lkb instead. */
+
+static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, namelen;
+
+ error = create_lkb(ls, &lkb);
+ if (error)
+ goto fail;
+
+ receive_flags(lkb, ms);
+ lkb->lkb_flags |= DLM_IFL_MSTCPY;
+ error = receive_request_args(ls, lkb, ms);
+ if (error) {
+ __put_lkb(ls, lkb);
+ goto fail;
+ }
+
+ /* everything after the fixed message is the resource name */
+ namelen = receive_extralen(ms);
+
+ error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
+ if (error) {
+ __put_lkb(ls, lkb);
+ goto fail;
+ }
+
+ lock_rsb(r);
+
+ attach_lkb(r, lkb);
+ error = do_request(r, lkb);
+ send_request_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+
+ /* -EINPROGRESS is not treated as a failure; for any other error the
+ lkb is put */
+ if (error == -EINPROGRESS)
+ error = 0;
+ if (error)
+ dlm_put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+/* Handle a DLM_MSG_CONVERT.  The lkb is looked up by the m_remid in the
+   message; under the rsb lock the message's flags and convert args are
+   applied and do_convert() is run.  A convert reply is sent unless the args
+   were accepted and the conversion is a down-conversion.  If the lkb can't be
+   found the error is sent back through the lockspace's stub rsb/lkb. */
+
+static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, reply = 1;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error)
+ goto fail;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ receive_flags(lkb, ms);
+ error = receive_convert_args(ls, lkb, ms);
+ if (error)
+ goto out;
+ /* no reply is sent for a down-conversion */
+ reply = !down_conversion(lkb);
+
+ error = do_convert(r, lkb);
+ out:
+ if (reply)
+ send_convert_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ /* drops the reference obtained through find_lkb() above */
+ dlm_put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+/* Handle a DLM_MSG_UNLOCK.  The lkb is looked up by the m_remid in the
+   message; under the rsb lock the message's flags and unlock args are applied
+   and do_unlock() is run.  An unlock reply carrying the result is always
+   sent; if the lkb can't be found it goes through the lockspace's stub
+   rsb/lkb. */
+
+static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error)
+ goto fail;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ receive_flags(lkb, ms);
+ error = receive_unlock_args(ls, lkb, ms);
+ if (error)
+ goto out;
+
+ error = do_unlock(r, lkb);
+ out:
+ send_unlock_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ /* drops the reference obtained through find_lkb() above */
+ dlm_put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+/* Handle a DLM_MSG_CANCEL.  The lkb is looked up by the m_remid in the
+   message, the message's flags are applied, and do_cancel() is run under the
+   rsb lock.  A cancel reply carrying the result is always sent; if the lkb
+   can't be found it goes through the lockspace's stub rsb/lkb. */
+
+static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error)
+ goto fail;
+
+ /* note: unlike the convert and unlock paths, the flags are taken
+ before the rsb is locked */
+ receive_flags(lkb, ms);
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = do_cancel(r, lkb);
+ send_cancel_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ /* drops the reference obtained through find_lkb() above */
+ dlm_put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+/* Handle a DLM_MSG_GRANT.  The lkb is looked up by the m_remid in the
+   message and is asserted to be a process copy.  Under the rsb lock the
+   reply flags are taken from the message, the lock is granted with
+   grant_lock_pc() and a completion ast with status 0 is queued.  A message
+   for an unknown lkb is logged and dropped. */
+
+static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_grant no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ receive_flags_reply(lkb, ms);
+ grant_lock_pc(r, lkb, ms);
+ queue_cast(r, lkb, 0);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ /* drops the reference obtained through find_lkb() above */
+ dlm_put_lkb(lkb);
+}
+
+/* Handle a DLM_MSG_BAST.  The lkb is looked up by the m_remid in the
+   message and is asserted to be a process copy.  A blocking ast for the mode
+   given in the message (m_bastmode) is queued under the rsb lock.  A message
+   for an unknown lkb is logged and dropped. */
+
+static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_bast no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ queue_bast(r, lkb, ms->m_bastmode);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ /* drops the reference obtained through find_lkb() above */
+ dlm_put_lkb(lkb);
+}
+
+/* Handle a DLM_MSG_LOOKUP sent to us as the directory node for a resource.
+   If the hash in the message doesn't map to this node, a lookup reply with
+   -EINVAL and nodeid -1 is sent.  Otherwise the name in the message's extra
+   data is looked up in the directory; when the lookup succeeds and names us
+   as the master, the message is processed as a request instead of being
+   answered with a lookup reply. */
+
+static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	int from_nodeid = ms->m_header.h_nodeid;
+	int our_nodeid = dlm_our_nodeid();
+	int len = receive_extralen(ms);
+	int dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+	int error, ret_nodeid;
+
+	if (dir_nodeid != our_nodeid) {
+		log_error(ls, "lookup dir_nodeid %d from %d",
+			  dir_nodeid, from_nodeid);
+		send_lookup_reply(ls, ms, -1, -EINVAL);
+		return;
+	}
+
+	error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
+
+	/* Optimization: we're master so treat lookup as a request */
+	if (!error && ret_nodeid == our_nodeid) {
+		receive_request(ls, ms);
+		return;
+	}
+
+	send_lookup_reply(ls, ms, ret_nodeid, error);
+}
+
+/* Handle a DLM_MSG_REMOVE: remove the directory entry for the name in the
+   message's extra data on behalf of the sending node.  The message is logged
+   and ignored if its hash doesn't map to this node.  No reply is sent. */
+
+static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	int from_nodeid = ms->m_header.h_nodeid;
+	int len = receive_extralen(ms);
+	int dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+
+	if (dir_nodeid == dlm_our_nodeid()) {
+		dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
+		return;
+	}
+
+	log_error(ls, "remove dir entry dir_nodeid %d from %d",
+		  dir_nodeid, from_nodeid);
+}
+
+/* Handle a DLM_MSG_REQUEST_REPLY.  The lkb is looked up by the m_remid in
+   the message, is asserted to be a process copy, and is taken off the
+   waiters list.  The result carried in the message then decides what happens
+   under the rsb lock: the request is failed with -EAGAIN, recorded as
+   waiting or granted, or started over with the master unknown. */
+
+static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, mstype;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_request_reply no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ /* remember what we were waiting for before leaving the waiters list */
+ mstype = lkb->lkb_wait_type;
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_request_reply not on waiters");
+ goto out;
+ }
+
+ /* this is the value returned from do_request() on the master */
+ error = ms->m_result;
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ /* Optimization: the dir node was also the master, so it took our
+ lookup as a request and sent request reply instead of lookup reply */
+ if (mstype == DLM_MSG_LOOKUP) {
+ r->res_nodeid = ms->m_header.h_nodeid;
+ lkb->lkb_nodeid = r->res_nodeid;
+ }
+
+ switch (error) {
+ case -EAGAIN:
+ /* request would block (be queued) on remote master;
+ the unhold undoes the original ref from create_lkb()
+ so it leads to the lkb being freed */
+ queue_cast(r, lkb, -EAGAIN);
+ confirm_master(r, -EAGAIN);
+ unhold_lkb(lkb);
+ break;
+
+ case -EINPROGRESS:
+ case 0:
+ /* request was queued or granted on remote master */
+ receive_flags_reply(lkb, ms);
+ /* save the master's lkid for this lock */
+ lkb->lkb_remid = ms->m_lkid;
+ if (error)
+ add_lkb(r, lkb, DLM_LKSTS_WAITING);
+ else {
+ grant_lock_pc(r, lkb, ms);
+ queue_cast(r, lkb, 0);
+ }
+ confirm_master(r, error);
+ break;
+
+ case -EBADR:
+ case -ENOTBLK:
+ /* find_rsb failed to find rsb or rsb wasn't master */
+ /* forget the master and start the request again */
+ r->res_nodeid = -1;
+ lkb->lkb_nodeid = -1;
+ _request_lock(r, lkb);
+ break;
+
+ default:
+ log_error(ls, "receive_request_reply error %d", error);
+ }
+
+ unlock_rsb(r);
+ put_rsb(r);
+ out:
+ /* drops the reference obtained through find_lkb() above */
+ dlm_put_lkb(lkb);
+}
+
+/* Apply the result of a convert, as returned from do_convert() on the
+   master and carried in ms->m_result, to our copy of the lock. */
+
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+				    struct dlm_message *ms)
+{
+	int result = ms->m_result;
+
+	if (result == -EAGAIN) {
+		/* convert would block (be queued) on remote master */
+		queue_cast(r, lkb, -EAGAIN);
+	} else if (result == -EINPROGRESS) {
+		/* convert was queued on remote master */
+		del_lkb(r, lkb);
+		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+	} else if (result == 0) {
+		/* convert was granted on remote master */
+		receive_flags_reply(lkb, ms);
+		grant_lock_pc(r, lkb, ms);
+		queue_cast(r, lkb, 0);
+	} else {
+		log_error(r->res_ls, "receive_convert_reply error %d", result);
+	}
+}
+
+/* Process a convert reply for lkb with its rsb held and locked. */
+
+static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_rsb *rsb;
+
+	rsb = lkb->lkb_resource;
+	hold_rsb(rsb);
+	lock_rsb(rsb);
+
+	__receive_convert_reply(rsb, lkb, ms);
+
+	unlock_rsb(rsb);
+	put_rsb(rsb);
+}
+
+/* Handle a DLM_MSG_CONVERT_REPLY: find the lkb named by m_remid, take it off
+   the waiters list and process the reply.  A reply for an unknown lkb, or for
+   one that isn't waiting, is logged and otherwise ignored. */
+
+static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+
+	if (find_lkb(ls, ms->m_remid, &lkb)) {
+		log_error(ls, "receive_convert_reply no lkb");
+		return;
+	}
+	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+	if (remove_from_waiters(lkb))
+		log_error(ls, "receive_convert_reply not on waiters");
+	else
+		_receive_convert_reply(lkb, ms);
+
+	dlm_put_lkb(lkb);
+}
+
+/* Apply the result of an unlock, as returned from do_unlock() on the master
+   and carried in ms->m_result, with the rsb held and locked.  Only
+   -DLM_EUNLOCK is expected; anything else is logged. */
+
+static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+	int result = ms->m_result;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	if (result == -DLM_EUNLOCK) {
+		receive_flags_reply(lkb, ms);
+		remove_lock_pc(r, lkb);
+		queue_cast(r, lkb, -DLM_EUNLOCK);
+	} else {
+		log_error(r->res_ls, "receive_unlock_reply error %d", result);
+	}
+
+	unlock_rsb(r);
+	put_rsb(r);
+}
+
+/* Handle a DLM_MSG_UNLOCK_REPLY: find the lkb named by m_remid, take it off
+   the waiters list and process the reply.  A reply for an unknown lkb, or for
+   one that isn't waiting, is logged and otherwise ignored. */
+
+static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+
+	if (find_lkb(ls, ms->m_remid, &lkb)) {
+		log_error(ls, "receive_unlock_reply no lkb");
+		return;
+	}
+	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+	if (remove_from_waiters(lkb))
+		log_error(ls, "receive_unlock_reply not on waiters");
+	else
+		_receive_unlock_reply(lkb, ms);
+
+	dlm_put_lkb(lkb);
+}
+
+/* Apply the result of a cancel, as returned from do_cancel() on the master
+   and carried in ms->m_result, with the rsb held and locked.  Only
+   -DLM_ECANCEL is expected; anything else is logged. */
+
+static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+	int result = ms->m_result;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	if (result == -DLM_ECANCEL) {
+		receive_flags_reply(lkb, ms);
+		revert_lock_pc(r, lkb);
+		queue_cast(r, lkb, -DLM_ECANCEL);
+	} else {
+		log_error(r->res_ls, "receive_cancel_reply error %d", result);
+	}
+
+	unlock_rsb(r);
+	put_rsb(r);
+}
+
+/* Handle a DLM_MSG_CANCEL_REPLY: find the lkb named by m_remid, take it off
+   the waiters list and process the reply.  A reply for an unknown lkb, or for
+   one that isn't waiting, is logged and otherwise ignored. */
+
+static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+
+	if (find_lkb(ls, ms->m_remid, &lkb)) {
+		log_error(ls, "receive_cancel_reply no lkb");
+		return;
+	}
+	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+	if (remove_from_waiters(lkb))
+		log_error(ls, "receive_cancel_reply not on waiters");
+	else
+		_receive_cancel_reply(lkb, ms);
+
+	dlm_put_lkb(lkb);
+}
+
+/* Handle a DLM_MSG_LOOKUP_REPLY.  The lkb is looked up by the m_lkid in the
+   message (the lookup reply echoes our own lkid) and taken off the waiters
+   list.  The master nodeid returned by the directory node is recorded in the
+   rsb and the request is then started with _request_lock(). */
+
+static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, ret_nodeid;
+
+ error = find_lkb(ls, ms->m_lkid, &lkb);
+ if (error) {
+ log_error(ls, "receive_lookup_reply no lkb");
+ return;
+ }
+
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_lookup_reply not on waiters");
+ goto out;
+ }
+
+ /* this is the value returned by dlm_dir_lookup on dir node
+ FIXME: will a non-zero error ever be returned? */
+ /* note: the value is stored but not examined below */
+ error = ms->m_result;
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ ret_nodeid = ms->m_nodeid;
+ if (ret_nodeid == dlm_our_nodeid()) {
+ /* we are the master: res_nodeid 0 stands for the local node */
+ r->res_nodeid = 0;
+ ret_nodeid = 0;
+ r->res_first_lkid = 0;
+ } else {
+ /* set_master() will copy res_nodeid to lkb_nodeid */
+ r->res_nodeid = ret_nodeid;
+ }
+
+ _request_lock(r, lkb);
+
+ /* only when we turned out to be the master */
+ if (!ret_nodeid)
+ process_lookup_list(r);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ out:
+ /* drops the reference obtained through find_lkb() above */
+ dlm_put_lkb(lkb);
+}
+
+/* Process one dlm_message.  hd is the message, nodeid the node it came from.
+   recovery is non-zero when the message is being replayed from the
+   requestqueue (see the comments below) rather than arriving fresh; fresh
+   messages are first passed through dlm_message_in().
+
+   Returns 0 when the message was dispatched, -EINVAL when it is for an
+   unknown lockspace, and -EINTR when locking is stopped and the message was
+   not processed.  In the -EINTR case a fresh message has been saved on the
+   requestqueue; a replayed one is simply left for the caller to deal with.
+   The -EINTR result used to be computed and then dropped in favour of 0, so
+   a caller replaying the requestqueue could not tell that it had been
+   interrupted. */
+
+int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
+{
+	struct dlm_message *ms = (struct dlm_message *) hd;
+	struct dlm_ls *ls;
+	int error = 0;
+
+	if (!recovery)
+		dlm_message_in(ms);
+
+	ls = dlm_find_lockspace_global(hd->h_lockspace);
+	if (!ls) {
+		log_print("drop message %d from %d for unknown lockspace %d",
+			  ms->m_type, nodeid, hd->h_lockspace);
+		return -EINVAL;
+	}
+
+	/* recovery may have just ended leaving a bunch of backed-up requests
+	   in the requestqueue; wait while dlm_recoverd clears them */
+
+	if (!recovery)
+		dlm_wait_requestqueue(ls);
+
+	/* recovery may have just started while there were a bunch of
+	   in-flight requests -- save them in requestqueue to be processed
+	   after recovery.  we can't let dlm_recvd block on the recovery
+	   lock.  if dlm_recoverd is calling this function to clear the
+	   requestqueue, it needs to be interrupted (-EINTR) if another
+	   recovery operation is starting. */
+
+	while (1) {
+		if (dlm_locking_stopped(ls)) {
+			if (!recovery)
+				dlm_add_requestqueue(ls, nodeid, hd);
+			error = -EINTR;
+			goto out;
+		}
+
+		if (lock_recovery_try(ls))
+			break;
+		schedule();
+	}
+
+	switch (ms->m_type) {
+
+	/* messages sent to a master node */
+
+	case DLM_MSG_REQUEST:
+		receive_request(ls, ms);
+		break;
+
+	case DLM_MSG_CONVERT:
+		receive_convert(ls, ms);
+		break;
+
+	case DLM_MSG_UNLOCK:
+		receive_unlock(ls, ms);
+		break;
+
+	case DLM_MSG_CANCEL:
+		receive_cancel(ls, ms);
+		break;
+
+	/* messages sent from a master node (replies to above) */
+
+	case DLM_MSG_REQUEST_REPLY:
+		receive_request_reply(ls, ms);
+		break;
+
+	case DLM_MSG_CONVERT_REPLY:
+		receive_convert_reply(ls, ms);
+		break;
+
+	case DLM_MSG_UNLOCK_REPLY:
+		receive_unlock_reply(ls, ms);
+		break;
+
+	case DLM_MSG_CANCEL_REPLY:
+		receive_cancel_reply(ls, ms);
+		break;
+
+	/* messages sent from a master node (only two types of async msg) */
+
+	case DLM_MSG_GRANT:
+		receive_grant(ls, ms);
+		break;
+
+	case DLM_MSG_BAST:
+		receive_bast(ls, ms);
+		break;
+
+	/* messages sent to a dir node */
+
+	case DLM_MSG_LOOKUP:
+		receive_lookup(ls, ms);
+		break;
+
+	case DLM_MSG_REMOVE:
+		receive_remove(ls, ms);
+		break;
+
+	/* messages sent from a dir node (remove has no reply) */
+
+	case DLM_MSG_LOOKUP_REPLY:
+		receive_lookup_reply(ls, ms);
+		break;
+
+	default:
+		log_error(ls, "unknown message type %d", ms->m_type);
+	}
+
+	unlock_recovery(ls);
+ out:
+	dlm_put_lockspace(ls);
+	dlm_astd_wake();
+	return error;
+}
+
+
+/*
+ * Recovery related
+ */
+
+/* Recover an lkb that was waiting for a convert reply.  A conversion
+   between the middle modes is completed locally by faking an -EINPROGRESS
+   reply with the lockspace's stub message; its granted mode is then reset to
+   DLM_LOCK_IV and the rsb is flagged RSB_RECOVER_CONVERT.  Any other
+   conversion with rqmode >= grmode is flagged to be resent. */
+
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+ if (middle_conversion(lkb)) {
+ /* extra reference held across the fake reply */
+ hold_lkb(lkb);
+ ls->ls_stub_ms.m_result = -EINPROGRESS;
+ _remove_from_waiters(lkb);
+ _receive_convert_reply(lkb, &ls->ls_stub_ms);
+
+ /* Same special case as in receive_rcom_lock_args() */
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
+ unhold_lkb(lkb);
+
+ } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
+ lkb->lkb_flags |= DLM_IFL_RESEND;
+ }
+
+ /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
+ conversions are async; there's no reply from the remote master */
+}
+
+/* Returns 1 if a waiting lkb needs recovery, 0 if not.  It does when the
+   node it is waiting on has been removed, or, only in a lockspace without a
+   directory, when that node is no longer the dir node of the resource (the
+   master is changing). */
+
+static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	if (dlm_is_removed(ls, lkb->lkb_nodeid))
+		return 1;
+
+	if (!dlm_no_directory(ls))
+		return 0;
+
+	return dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid;
+}
+
+/* Recovery for locks that are waiting for replies from nodes that are now
+   gone. We can just complete unlocks and cancels by faking a reply from the
+   dead node. Requests and up-conversions we flag to be resent after
+   recovery. Down-conversions can just be completed with a fake reply like
+   unlocks. Conversions between PR and CW need special attention.
+
+   The whole waiters list is walked with ls_waiters_mutex held. */
+
+void dlm_recover_waiters_pre(struct dlm_ls *ls)
+{
+ struct dlm_lkb *lkb, *safe;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+
+ /* the _safe iterator is needed because entries are removed from the
+ list inside the loop */
+ list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
+ log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
+ lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
+
+ /* all outstanding lookups, regardless of destination will be
+ resent after recovery is done */
+
+ if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
+ lkb->lkb_flags |= DLM_IFL_RESEND;
+ continue;
+ }
+
+ if (!waiter_needs_recovery(ls, lkb))
+ continue;
+
+ switch (lkb->lkb_wait_type) {
+
+ case DLM_MSG_REQUEST:
+ lkb->lkb_flags |= DLM_IFL_RESEND;
+ break;
+
+ case DLM_MSG_CONVERT:
+ recover_convert_waiter(ls, lkb);
+ break;
+
+ case DLM_MSG_UNLOCK:
+ /* fake the reply the dead node would have sent, using
+ the lockspace's stub message */
+ hold_lkb(lkb);
+ ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+ _remove_from_waiters(lkb);
+ _receive_unlock_reply(lkb, &ls->ls_stub_ms);
+ dlm_put_lkb(lkb);
+ break;
+
+ case DLM_MSG_CANCEL:
+ /* same as for unlock, with a fake cancel reply */
+ hold_lkb(lkb);
+ ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+ _remove_from_waiters(lkb);
+ _receive_cancel_reply(lkb, &ls->ls_stub_ms);
+ dlm_put_lkb(lkb);
+ break;
+
+ default:
+ log_error(ls, "invalid lkb wait_type %d",
+ lkb->lkb_wait_type);
+ }
+ schedule();
+ }
+ mutex_unlock(&ls->ls_waiters_mutex);
+}
+
+/* Take the first lkb flagged DLM_IFL_RESEND off the waiters list and clear
+   the flag.  Returns the message type the lkb was waiting for and stores the
+   lkb in *lkb_ret; returns 0 and stores NULL if no lkb is flagged. */
+
+static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+ struct dlm_lkb *lkb;
+ int rv = 0;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+ list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+ if (lkb->lkb_flags & DLM_IFL_RESEND) {
+ rv = lkb->lkb_wait_type;
+ _remove_from_waiters(lkb);
+ lkb->lkb_flags &= ~DLM_IFL_RESEND;
+ break;
+ }
+ }
+ mutex_unlock(&ls->ls_waiters_mutex);
+
+ /* if the loop ran to the end, lkb does not point at a real entry */
+ if (!rv)
+ lkb = NULL;
+ *lkb_ret = lkb;
+ return rv;
+}
+
+/* Resend what dlm_recover_waiters_pre() flagged RESEND, including all
+   lookups.  We may now be the master or dir-node for r, and processing an
+   lkb may put it back on the waiters list.  Returns 0 once no flagged lkb is
+   left, or -EINTR if locking was stopped again in the meantime. */
+
+int dlm_recover_waiters_post(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int mstype;
+	int error = 0;
+
+	for (;;) {
+		if (dlm_locking_stopped(ls)) {
+			log_debug(ls, "recover_waiters_post aborted");
+			error = -EINTR;
+			break;
+		}
+
+		mstype = remove_resend_waiter(ls, &lkb);
+		if (!mstype)
+			break;
+
+		r = lkb->lkb_resource;
+
+		log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
+			  lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
+
+		switch (mstype) {
+
+		/* lookups and requests are restarted in the same way */
+		case DLM_MSG_LOOKUP:
+		case DLM_MSG_REQUEST:
+			hold_rsb(r);
+			lock_rsb(r);
+			_request_lock(r, lkb);
+			if (is_master(r))
+				confirm_master(r, 0);
+			unlock_rsb(r);
+			put_rsb(r);
+			break;
+
+		case DLM_MSG_CONVERT:
+			hold_rsb(r);
+			lock_rsb(r);
+			_convert_lock(r, lkb);
+			unlock_rsb(r);
+			put_rsb(r);
+			break;
+
+		default:
+			log_error(ls, "recover_waiters_post type %d", mstype);
+		}
+	}
+
+	return error;
+}
+
+/* Remove from one queue of r every lkb that test() selects.  The rsb is
+   flagged RSB_LOCKS_PURGED for each lkb removed. */
+
+static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
+			int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
+{
+	struct dlm_ls *ls = r->res_ls;
+	struct dlm_lkb *lkb, *next;
+
+	list_for_each_entry_safe(lkb, next, queue, lkb_statequeue) {
+		if (!test(ls, lkb))
+			continue;
+
+		rsb_set_flag(r, RSB_LOCKS_PURGED);
+		del_lkb(r, lkb);
+
+		/* this put should free the lkb */
+		if (!dlm_put_lkb(lkb))
+			log_error(ls, "purged lkb not released");
+	}
+}
+
+/* purge_queue() test: selects master copies whose node has been removed. */
+
+static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	if (!is_master_copy(lkb))
+		return 0;
+
+	return dlm_is_removed(ls, lkb->lkb_nodeid) ? 1 : 0;
+}
+
+/* purge_queue() test: selects every master copy; ls is not used. */
+
+static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	int selected = is_master_copy(lkb);
+
+	return selected;
+}
+
+/* Purge the master copies held by removed nodes from the grant, convert
+   and wait queues of r, in that order. */
+
+static void purge_dead_locks(struct dlm_rsb *r)
+{
+	struct list_head *queues[] = {
+		&r->res_grantqueue,
+		&r->res_convertqueue,
+		&r->res_waitqueue,
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(queues) / sizeof(queues[0]); i++)
+		purge_queue(r, queues[i], &purge_dead_test);
+}
+
+/* Purge every master copy from the grant, convert and wait queues of r, in
+   that order. */
+
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
+{
+	struct list_head *queues[] = {
+		&r->res_grantqueue,
+		&r->res_convertqueue,
+		&r->res_waitqueue,
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(queues) / sizeof(queues[0]); i++)
+		purge_queue(r, queues[i], &purge_mstcpy_test);
+}
+
+/* Get rid of locks held by nodes that are gone.  Every rsb on the root list
+   is visited with ls_root_sem held for writing; only rsbs we are master of
+   are purged.  Always returns 0. */
+
+int dlm_purge_locks(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+
+ log_debug(ls, "dlm_purge_locks");
+
+ down_write(&ls->ls_root_sem);
+ list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+ hold_rsb(r);
+ lock_rsb(r);
+ if (is_master(r))
+ purge_dead_locks(r);
+ unlock_rsb(r);
+ /* note: the hold is dropped with unhold_rsb(), not put_rsb() */
+ unhold_rsb(r);
+
+ schedule();
+ }
+ up_write(&ls->ls_root_sem);
+
+ return 0;
+}
+
+/* Find the first rsb in the given hash bucket that is flagged
+   RSB_LOCKS_PURGED.  The flag is cleared and the rsb is returned with a
+   reference held; NULL is returned if the bucket has no flagged rsb. */
+
+static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
+{
+ struct dlm_rsb *r, *r_ret = NULL;
+
+ read_lock(&ls->ls_rsbtbl[bucket].lock);
+ list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
+ if (!rsb_flag(r, RSB_LOCKS_PURGED))
+ continue;
+ /* the caller is responsible for dropping this reference */
+ hold_rsb(r);
+ rsb_clear_flag(r, RSB_LOCKS_PURGED);
+ r_ret = r;
+ break;
+ }
+ read_unlock(&ls->ls_rsbtbl[bucket].lock);
+ return r_ret;
+}
+
+/* Go through every rsb that had locks purged (flagged RSB_LOCKS_PURGED) and,
+   where we are the master, grant whatever pending locks can now be
+   granted. */
+
+void dlm_grant_after_purge(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+ int bucket = 0;
+
+ while (1) {
+ /* a bucket is searched again and again until it has no flagged
+ rsb left; only then do we move on to the next bucket */
+ r = find_purged_rsb(ls, bucket);
+ if (!r) {
+ if (bucket == ls->ls_rsbtbl_size - 1)
+ break;
+ bucket++;
+ continue;
+ }
+ lock_rsb(r);
+ if (is_master(r)) {
+ grant_pending_locks(r);
+ confirm_master(r, 0);
+ }
+ unlock_rsb(r);
+ /* drops the reference taken by find_purged_rsb() */
+ put_rsb(r);
+ schedule();
+ }
+}
+
+/* Search one rsb queue for the lkb with the given nodeid and remote lkid.
+   Returns the lkb, or NULL if the queue has no such lkb. */
+
+static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
+					 uint32_t remid)
+{
+	struct dlm_lkb *lkb;
+
+	list_for_each_entry(lkb, head, lkb_statequeue) {
+		if (lkb->lkb_nodeid != nodeid)
+			continue;
+		if (lkb->lkb_remid != remid)
+			continue;
+		return lkb;
+	}
+
+	return NULL;
+}
+
+/* Search the grant, convert and wait queues of r, in that order, for the lkb
+   with the given nodeid and remote lkid.  Returns the lkb, or NULL if none of
+   the queues has it. */
+
+static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
+				    uint32_t remid)
+{
+	struct dlm_lkb *found;
+
+	found = search_remid_list(&r->res_grantqueue, nodeid, remid);
+	if (!found)
+		found = search_remid_list(&r->res_convertqueue, nodeid, remid);
+	if (!found)
+		found = search_remid_list(&r->res_waitqueue, nodeid, remid);
+
+	return found;
+}
+
+/* Fill in a new master-copy lkb from the rcom_lock carried in rc.  Returns 0
+   on success, -ENOMEM if the lvb buffer can't be allocated, or -EINVAL if the
+   lvb length implied by the message header is not usable. */
+
+static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				  struct dlm_rsb *r, struct dlm_rcom *rc)
+{
+	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+	int lvblen;
+
+	lkb->lkb_nodeid = rc->rc_header.h_nodeid;
+	lkb->lkb_ownpid = rl->rl_ownpid;
+	lkb->lkb_remid = rl->rl_lkid;
+	lkb->lkb_exflags = rl->rl_exflags;
+	/* only the low 16 bits of the flags are taken from the sender */
+	lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
+	lkb->lkb_flags |= DLM_IFL_MSTCPY;
+	lkb->lkb_lvbseq = rl->rl_lvbseq;
+	lkb->lkb_rqmode = rl->rl_rqmode;
+	lkb->lkb_grmode = rl->rl_grmode;
+	/* don't set lkb_status because add_lkb wants to itself */
+
+	/* only whether an ast/bast is wanted is recorded, not a real address */
+	lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
+	lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
+
+	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+		/* The length comes from the header of a received message,
+		   so reject anything that is negative or larger than the
+		   lvb buffer before copying (NOTE(review): assumes
+		   allocate_lvb() sizes the buffer from ls->ls_lvblen --
+		   confirm). */
+		lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
+			 sizeof(struct rcom_lock);
+		if (lvblen < 0 || lvblen > ls->ls_lvblen)
+			return -EINVAL;
+
+		lkb->lkb_lvbptr = allocate_lvb(ls);
+		if (!lkb->lkb_lvbptr)
+			return -ENOMEM;
+		memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
+	}
+
+	/* Conversions between PR and CW (middle modes) need special handling.
+	   The real granted mode of these converting locks cannot be determined
+	   until all locks have been rebuilt on the rsb (recover_conversion) */
+
+	if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
+		rl->rl_status = DLM_LKSTS_CONVERT;
+		lkb->lkb_grmode = DLM_LOCK_IV;
+		rsb_set_flag(r, RSB_RECOVER_CONVERT);
+	}
+
+	return 0;
+}
+
+/* This lkb may have been recovered in a previous aborted recovery so we need
+   to check if the rsb already has an lkb with the given remote nodeid/lkid.
+   If so we just send back a standard reply. If not, we create a new lkb with
+   the given values and send back our lkid. We send back our lkid by sending
+   back the rcom_lock struct we got but with the remid field filled in.
+
+   The result is returned and also stored in rl_result: 0, -EEXIST when the
+   lkb was already there, -EOPNOTSUPP for a lock with a parent, or the error
+   from find_rsb(), create_lkb() or receive_rcom_lock_args(). */
+
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+ struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+ struct dlm_rsb *r;
+ struct dlm_lkb *lkb;
+ int error;
+
+ /* locks with a parent lock are not supported */
+ if (rl->rl_parent_lkid) {
+ error = -EOPNOTSUPP;
+ goto out;
+ }
+
+ error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
+ if (error)
+ goto out;
+
+ lock_rsb(r);
+
+ lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
+ if (lkb) {
+ /* the id of the existing lkb is still reported back below */
+ error = -EEXIST;
+ goto out_remid;
+ }
+
+ error = create_lkb(ls, &lkb);
+ if (error)
+ goto out_unlock;
+
+ error = receive_rcom_lock_args(ls, lkb, r, rc);
+ if (error) {
+ __put_lkb(ls, lkb);
+ goto out_unlock;
+ }
+
+ attach_lkb(r, lkb);
+ add_lkb(r, lkb, rl->rl_status);
+ error = 0;
+
+ out_remid:
+ /* this is the new value returned to the lock holder for
+ saving in its process-copy lkb */
+ rl->rl_remid = lkb->lkb_id;
+
+ out_unlock:
+ unlock_rsb(r);
+ put_rsb(r);
+ out:
+ /* note: -EEXIST is logged here as well */
+ if (error)
+ log_print("recover_master_copy %d %x", error, rl->rl_lkid);
+ rl->rl_result = error;
+ return error;
+}
+
+/* Process the rcom_lock that comes back from dlm_recover_master_copy() on
+   the new master.  The lkb is looked up by rl_lkid and asserted to be a
+   process copy; for a result of 0 or -EEXIST the master's lkid (rl_remid) is
+   saved in it.  Returns the find_lkb() error if the lkb can't be found,
+   otherwise 0 whatever the result in the message was. */
+
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+ struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+ struct dlm_rsb *r;
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, rl->rl_lkid, &lkb);
+ if (error) {
+ log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
+ return error;
+ }
+
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ /* the result stored by dlm_recover_master_copy() */
+ error = rl->rl_result;
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ switch (error) {
+ case -EEXIST:
+ log_debug(ls, "master copy exists %x", lkb->lkb_id);
+ /* fall through */
+ case 0:
+ lkb->lkb_remid = rl->rl_remid;
+ break;
+ default:
+ log_error(ls, "dlm_recover_process_copy unknown error %d %x",
+ error, lkb->lkb_id);
+ }
+
+ /* an ack for dlm_recover_locks() which waits for replies from
+ all the locks it sends to new masters */
+ /* note: the ack is given for the error cases as well */
+ dlm_recovered_lock(r);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ /* drops the reference obtained through find_lkb() above */
+ dlm_put_lkb(lkb);
+
+ return 0;
+}
+
+/* Request a new lock on behalf of a userspace process.  ua holds the user's
+   args and is owned by this function from the moment it is called: it is
+   either attached to the new lkb or freed here.  name/namelen give the
+   resource name.  Returns 0 when the request was granted, queued or failed
+   with -EAGAIN, otherwise a negative error.  The lkb is added to the
+   process's list of locks unless the request failed. */
+
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
+		     int mode, uint32_t flags, void *name, unsigned int namelen,
+		     uint32_t parent_lkid)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	int error;
+
+	lock_recovery(ls);
+
+	error = create_lkb(ls, &lkb);
+	if (error) {
+		kfree(ua);
+		goto out;
+	}
+
+	if (flags & DLM_LKF_VALBLK) {
+		/* zeroed, so that a new lvb never holds stale kernel memory */
+		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+		if (!ua->lksb.sb_lvbptr) {
+			kfree(ua);
+			__put_lkb(ls, lkb);
+			error = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* After ua is attached to lkb it will be freed by free_lkb().
+	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
+	   lock and that lkb_astparam is the dlm_user_args structure. */
+
+	error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid,
+			      DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args);
+	lkb->lkb_flags |= DLM_IFL_USER;
+	ua->old_mode = DLM_LOCK_IV;
+
+	if (error) {
+		/* set_lock_args() is not given the lkb, so ua can't have
+		   been attached to it yet and putting the lkb won't free
+		   ua; free it here like the failure paths above do */
+		kfree(ua->lksb.sb_lvbptr);
+		kfree(ua);
+		__put_lkb(ls, lkb);
+		goto out;
+	}
+
+	error = request_lock(ls, lkb, name, namelen, &args);
+
+	switch (error) {
+	case 0:
+		break;
+	case -EINPROGRESS:
+		error = 0;
+		break;
+	case -EAGAIN:
+		error = 0;
+		/* fall through */
+	default:
+		__put_lkb(ls, lkb);
+		goto out;
+	}
+
+	/* add this new lkb to the per-process list of locks */
+	spin_lock(&ua->proc->locks_spin);
+	kref_get(&lkb->lkb_ref);
+	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
+	spin_unlock(&ua->proc->locks_spin);
+ out:
+	unlock_recovery(ls);
+	return error;
+}
+
+/* Convert the userspace lock lkid to a new mode.  ua_tmp carries the args of
+   this call; the ones that may change on a convert are copied into the args
+   attached to the lkb, and ua_tmp is always freed before returning.  lvb_in,
+   if not NULL, supplies DLM_USER_LVB_LEN bytes of lvb data.  Returns 0 when
+   the convert was granted, queued or failed with -EAGAIN, otherwise a
+   negative error. */
+
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+		     int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	struct dlm_user_args *ua;
+	int error;
+
+	lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	/* user can change the params on its lock when it converts it, or
+	   add an lvb that didn't exist before */
+
+	ua = (struct dlm_user_args *)lkb->lkb_astparam;
+
+	/* lkb_astparam is cleared when a persistent lock is orphaned (see
+	   orphan_proc_lock()), so there may be no user args to update */
+	if (!ua) {
+		error = -EINVAL;
+		goto out_put;
+	}
+
+	if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
+		/* zeroed, so that a new lvb never holds stale kernel memory */
+		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+		if (!ua->lksb.sb_lvbptr) {
+			error = -ENOMEM;
+			goto out_put;
+		}
+	}
+	if (lvb_in && ua->lksb.sb_lvbptr)
+		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+
+	ua->castparam = ua_tmp->castparam;
+	ua->castaddr = ua_tmp->castaddr;
+	ua->bastparam = ua_tmp->bastparam;
+	ua->bastaddr = ua_tmp->bastaddr;
+	ua->user_lksb = ua_tmp->user_lksb;
+	/* remember the mode the lock is being converted from */
+	ua->old_mode = lkb->lkb_grmode;
+
+	error = set_lock_args(mode, &ua->lksb, flags, 0, 0, DLM_FAKE_USER_AST,
+			      ua, DLM_FAKE_USER_AST, &args);
+	if (error)
+		goto out_put;
+
+	error = convert_lock(ls, lkb, &args);
+
+	if (error == -EINPROGRESS || error == -EAGAIN)
+		error = 0;
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	unlock_recovery(ls);
+	kfree(ua_tmp);
+	return error;
+}
+
+/* Unlock the userspace lock lkid.  ua_tmp carries the args of this call; the
+   castparam and user_lksb are copied into the args attached to the lkb.
+   lvb_in, if not NULL, supplies DLM_USER_LVB_LEN bytes of lvb data.  On
+   success the lkb is taken off the process's list of locks.  Returns 0 on
+   success, otherwise a negative error.
+
+   ua_tmp is freed before returning, as dlm_user_convert() does.
+   NOTE(review): this assumes the caller hands over ownership of ua_tmp for
+   unlock just as it does for convert -- confirm against the caller. */
+
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+		    uint32_t flags, uint32_t lkid, char *lvb_in)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	struct dlm_user_args *ua;
+	int error;
+
+	lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	ua = (struct dlm_user_args *)lkb->lkb_astparam;
+
+	/* lkb_astparam is cleared when a persistent lock is orphaned (see
+	   orphan_proc_lock()), so there may be no user args to update */
+	if (!ua) {
+		error = -EINVAL;
+		goto out_put;
+	}
+
+	if (lvb_in && ua->lksb.sb_lvbptr)
+		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+	ua->castparam = ua_tmp->castparam;
+	ua->user_lksb = ua_tmp->user_lksb;
+
+	error = set_unlock_args(flags, ua, &args);
+	if (error)
+		goto out_put;
+
+	error = unlock_lock(ls, lkb, &args);
+
+	if (error == -DLM_EUNLOCK)
+		error = 0;
+	if (error)
+		goto out_put;
+
+	spin_lock(&ua->proc->locks_spin);
+	list_del_init(&lkb->lkb_ownqueue);
+	spin_unlock(&ua->proc->locks_spin);
+
+	/* this removes the reference for the proc->locks list added by
+	   dlm_user_request */
+	unhold_lkb(lkb);
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	unlock_recovery(ls);
+	kfree(ua_tmp);
+	return error;
+}
+
+/* Cancel the userspace lock lkid.  ua_tmp carries the args of this call; the
+   castparam and user_lksb are copied into the args attached to the lkb.
+   Returns 0 on success, otherwise a negative error.
+
+   ua_tmp is freed before returning, as dlm_user_convert() does.
+   NOTE(review): this assumes the caller hands over ownership of ua_tmp for
+   cancel just as it does for convert -- confirm against the caller. */
+
+int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+		    uint32_t flags, uint32_t lkid)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	struct dlm_user_args *ua;
+	int error;
+
+	lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	ua = (struct dlm_user_args *)lkb->lkb_astparam;
+
+	/* lkb_astparam is cleared when a persistent lock is orphaned (see
+	   orphan_proc_lock()), so there may be no user args to update */
+	if (!ua) {
+		error = -EINVAL;
+		goto out_put;
+	}
+
+	ua->castparam = ua_tmp->castparam;
+	ua->user_lksb = ua_tmp->user_lksb;
+
+	error = set_unlock_args(flags, ua, &args);
+	if (error)
+		goto out_put;
+
+	error = cancel_lock(ls, lkb, &args);
+
+	if (error == -DLM_ECANCEL)
+		error = 0;
+	if (error)
+		goto out_put;
+
+	/* this lkb was removed from the WAITING queue */
+	if (lkb->lkb_grmode == DLM_LOCK_IV) {
+		spin_lock(&ua->proc->locks_spin);
+		list_del_init(&lkb->lkb_ownqueue);
+		spin_unlock(&ua->proc->locks_spin);
+		/* drop the proc->locks list reference, as on unlock */
+		unhold_lkb(lkb);
+	}
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	unlock_recovery(ls);
+	kfree(ua_tmp);
+	return error;
+}
+
+/* Detach a lock from the process that owned it: the user args attached to
+   the lkb, and the lvb buffer they point to, are freed and lkb_astparam is
+   cleared.  ls is not used.  Always returns 0. */
+
+static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	struct dlm_user_args *ua;
+
+	ua = (struct dlm_user_args *)lkb->lkb_astparam;
+
+	/* kfree() accepts NULL, so a missing lvb needs no special case */
+	kfree(ua->lksb.sb_lvbptr);
+	kfree(ua);
+	lkb->lkb_astparam = (long)NULL;
+
+	/* TODO: propagate to master if needed */
+	return 0;
+}
+
+/* Unlock a lock of a process that is going away.  The lock is unlocked with
+   DLM_LKF_FORCEUNLOCK: the force flag allows the unlock to go ahead even if
+   the lkb isn't granted, so regardless of what rsb queue the lock is on, it's
+   removed and freed.  Returns 0 on success (-DLM_EUNLOCK from unlock_lock()
+   counts as success), otherwise the error from unlock_lock(). */
+
+static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	struct dlm_user_args *ua;
+	struct dlm_args args;
+	int error;
+
+	ua = (struct dlm_user_args *)lkb->lkb_astparam;
+
+	/* FIXME: we need to handle the case where the lkb is in limbo
+	   while the rsb is being looked up, currently we assert in
+	   _unlock_lock/is_remote because rsb nodeid is -1. */
+
+	set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
+
+	error = unlock_lock(ls, lkb, &args);
+
+	return (error == -DLM_EUNLOCK) ? 0 : error;
+}
+
+/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
+   1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
+   which we clear here. */
+
+/* proc CLOSING flag is set so no more device_reads should look at proc->asts
+   list, and no more device_writes should add lkb's to proc->locks list; so we
+   shouldn't need to take asts_spin or locks_spin here. this assumes that
+   device reads/writes/closes are serialized -- FIXME: we may need to serialize
+   them ourself. */
+
+/* Dispose of every lock on proc->locks: locks with DLM_LKF_PERSISTENT are
+   orphaned, all others are unlocked by force. */
+
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
+{
+ struct dlm_lkb *lkb, *safe;
+
+ lock_recovery(ls);
+ mutex_lock(&ls->ls_clear_proc_locks);
+
+ list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
+ /* an ast is pending for this lkb: take it off the ast queue
+ and drop a reference on it */
+ if (lkb->lkb_ast_type) {
+ list_del(&lkb->lkb_astqueue);
+ unhold_lkb(lkb);
+ }
+
+ list_del_init(&lkb->lkb_ownqueue);
+
+ if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
+ lkb->lkb_flags |= DLM_IFL_ORPHAN;
+ orphan_proc_lock(ls, lkb);
+ } else {
+ lkb->lkb_flags |= DLM_IFL_DEAD;
+ /* the result of the unlock is not checked */
+ unlock_proc_lock(ls, lkb);
+ }
+
+ /* this removes the reference for the proc->locks list
+ added by dlm_user_request, it may result in the lkb
+ being freed */
+
+ dlm_put_lkb(lkb);
+ }
+ mutex_unlock(&ls->ls_clear_proc_locks);
+ unlock_recovery(ls);
+}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..0843a3073ec3
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,62 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCK_DOT_H__
+#define __LOCK_DOT_H__
+
+void dlm_print_rsb(struct dlm_rsb *r);
+void dlm_dump_rsb(struct dlm_rsb *r);
+void dlm_print_lkb(struct dlm_lkb *lkb);
+int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
+int dlm_modes_compat(int mode1, int mode2);
+int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
+ unsigned int flags, struct dlm_rsb **r_ret);
+void dlm_put_rsb(struct dlm_rsb *r);
+void dlm_hold_rsb(struct dlm_rsb *r);
+int dlm_put_lkb(struct dlm_lkb *lkb);
+void dlm_scan_rsbs(struct dlm_ls *ls);
+
+int dlm_purge_locks(struct dlm_ls *ls);
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
+void dlm_grant_after_purge(struct dlm_ls *ls);
+int dlm_recover_waiters_post(struct dlm_ls *ls);
+void dlm_recover_waiters_pre(struct dlm_ls *ls);
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
+ uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid);
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ uint32_t flags, uint32_t lkid, char *lvb_in);
+int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ uint32_t flags, uint32_t lkid);
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
+
+/* Returns non-zero when r->res_nodeid is 0, zero otherwise. */
+static inline int is_master(struct dlm_rsb *r)
+{
+	return r->res_nodeid == 0;
+}
+
+/* Acquire r->res_mutex; paired with unlock_rsb(). */
+static inline void lock_rsb(struct dlm_rsb *r)
+{
+ mutex_lock(&r->res_mutex);
+}
+
+/* Release r->res_mutex taken by lock_rsb(). */
+static inline void unlock_rsb(struct dlm_rsb *r)
+{
+ mutex_unlock(&r->res_mutex);
+}
+
+#endif
+
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..109333c8ecb9
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,717 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "ast.h"
+#include "dir.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "recover.h"
+
+#ifdef CONFIG_DLM_DEBUG
+int dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_delete_debug_file(struct dlm_ls *ls);
+#else
+static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
+static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+#endif
+
+/* number of lockspaces created via dlm_new_lockspace(); changed under ls_lock */
+static int ls_count;
+/* serialises lockspace creation/teardown accounting and thread start/stop */
+static struct mutex ls_lock;
+/* list of all lockspaces, linked through dlm_ls.ls_list */
+static struct list_head lslist;
+/* taken around lslist updates and lookups and around ls->ls_count changes */
+static spinlock_t lslist_lock;
+/* the "dlm_scand" kthread, set by dlm_scand_start() */
+static struct task_struct * scand_task;
+
+
+/*
+ * sysfs "control" store handler: writing 0 stops the lockspace, writing 1
+ * starts it.  Returns len on success and -EINVAL for any other number.
+ */
+static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	int cmd = simple_strtol(buf, NULL, 0);
+
+	if (cmd == 0)
+		dlm_ls_stop(ls);
+	else if (cmd == 1)
+		dlm_ls_start(ls);
+	else
+		return -EINVAL;
+
+	return len;
+}
+
+/*
+ * sysfs "event_done" store handler: record the number written as the uevent
+ * result, set LSFL_UEVENT_WAIT and wake anyone sleeping on ls_uevent_wait
+ * (do_uevent() waits for that bit).  Always returns len.
+ */
+static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+ ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
+ set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
+ wake_up(&ls->ls_uevent_wait);
+ return len;
+}
+
+/* sysfs "id" show handler: print ls_global_id in unsigned decimal. */
+static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id);
+}
+
+/*
+ * sysfs "id" store handler: set ls_global_id from the number in buf (base
+ * auto-detected by simple_strtoul).  Always returns len; the input is not
+ * validated.
+ */
+static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+ ls->ls_global_id = simple_strtoul(buf, NULL, 0);
+ return len;
+}
+
+/* sysfs "recover_status" show handler: print dlm_recover_status(ls) in hex. */
+static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
+{
+ uint32_t status = dlm_recover_status(ls);
+ return snprintf(buf, PAGE_SIZE, "%x\n", status);
+}
+
+/* sysfs "recover_nodeid" show handler: print ls_recover_nodeid in decimal. */
+static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid);
+}
+
+/*
+ * A sysfs attribute of a lockspace.  dlm_attr_show()/dlm_attr_store() recover
+ * this struct from the embedded attribute and call the handler if it is set;
+ * either handler may be left NULL.
+ */
+struct dlm_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct dlm_ls *, char *);
+ ssize_t (*store)(struct dlm_ls *, const char *, size_t);
+};
+
+/* "control": owner-writable only; 0 stops and 1 starts the lockspace */
+static struct dlm_attr dlm_attr_control = {
+ .attr = {.name = "control", .mode = S_IWUSR},
+ .store = dlm_control_store
+};
+
+/* "event_done": owner-writable only; delivers the result do_uevent() waits for */
+static struct dlm_attr dlm_attr_event = {
+ .attr = {.name = "event_done", .mode = S_IWUSR},
+ .store = dlm_event_store
+};
+
+/* "id": world-readable, owner-writable; the lockspace's ls_global_id */
+static struct dlm_attr dlm_attr_id = {
+ .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
+ .show = dlm_id_show,
+ .store = dlm_id_store
+};
+
+/* "recover_status": read-only; value of dlm_recover_status() in hex */
+static struct dlm_attr dlm_attr_recover_status = {
+ .attr = {.name = "recover_status", .mode = S_IRUGO},
+ .show = dlm_recover_status_show
+};
+
+/* "recover_nodeid": read-only; value of ls_recover_nodeid */
+static struct dlm_attr dlm_attr_recover_nodeid = {
+ .attr = {.name = "recover_nodeid", .mode = S_IRUGO},
+ .show = dlm_recover_nodeid_show
+};
+
+/* NULL-terminated default attribute list referenced by dlm_ktype */
+static struct attribute *dlm_attrs[] = {
+ &dlm_attr_control.attr,
+ &dlm_attr_event.attr,
+ &dlm_attr_id.attr,
+ &dlm_attr_recover_status.attr,
+ &dlm_attr_recover_nodeid.attr,
+ NULL,
+};
+
+/*
+ * sysfs show dispatcher: map the kobject back to its lockspace and the
+ * attribute back to its dlm_attr, then call the attribute's show handler.
+ * Returns 0 when the attribute has no show handler.
+ */
+static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
+			     char *buf)
+{
+	struct dlm_attr *da = container_of(attr, struct dlm_attr, attr);
+	struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
+
+	if (!da->show)
+		return 0;
+
+	return da->show(ls, buf);
+}
+
+/*
+ * sysfs store dispatcher: map the kobject back to its lockspace and the
+ * attribute back to its dlm_attr, then call the attribute's store handler.
+ * An attribute without a store handler accepts the write and returns len.
+ */
+static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
+			      const char *buf, size_t len)
+{
+	struct dlm_attr *da = container_of(attr, struct dlm_attr, attr);
+	struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
+
+	if (!da->store)
+		return len;
+
+	return da->store(ls, buf, len);
+}
+
+/* show/store entry points shared by every lockspace attribute */
+static struct sysfs_ops dlm_attr_ops = {
+ .show = dlm_attr_show,
+ .store = dlm_attr_store,
+};
+
+/* kobject type assigned to each lockspace kobject in kobject_setup() */
+static struct kobj_type dlm_ktype = {
+ .default_attrs = dlm_attrs,
+ .sysfs_ops = &dlm_attr_ops,
+};
+
+/* the "dlm" kset under kernel_subsys; registered in dlm_lockspace_init() */
+static struct kset dlm_kset = {
+ .subsys = &kernel_subsys,
+ .kobj = {.name = "dlm",},
+ .ktype = &dlm_ktype,
+};
+
+/*
+ * Name the lockspace's kobject after the lockspace (truncated to fit a
+ * DLM_LOCKSPACE_LEN byte buffer including the terminator) and attach it to
+ * dlm_kset/dlm_ktype.  Returns 0 or the error from kobject_set_name(), in
+ * which case the kset and ktype are left untouched.
+ */
+static int kobject_setup(struct dlm_ls *ls)
+{
+	char kname[DLM_LOCKSPACE_LEN] = { 0 };
+	int rv;
+
+	snprintf(kname, sizeof(kname), "%s", ls->ls_name);
+
+	rv = kobject_set_name(&ls->ls_kobj, "%s", kname);
+	if (!rv) {
+		ls->ls_kobj.kset = &dlm_kset;
+		ls->ls_kobj.ktype = &dlm_ktype;
+	}
+	return rv;
+}
+
+/*
+ * Emit a KOBJ_ONLINE (in != 0) or KOBJ_OFFLINE (in == 0) uevent for the
+ * lockspace, then sleep until LSFL_UEVENT_WAIT has been set (it is cleared
+ * again here).  Returns the wait's error if it was interrupted, otherwise
+ * ls_uevent_result.
+ */
+static int do_uevent(struct dlm_ls *ls, int in)
+{
+	int rv;
+
+	kobject_uevent(&ls->ls_kobj, in ? KOBJ_ONLINE : KOBJ_OFFLINE);
+
+	rv = wait_event_interruptible(ls->ls_uevent_wait,
+			test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
+	if (rv)
+		return rv;
+
+	return ls->ls_uevent_result;
+}
+
+
+/*
+ * Initialise the file-scope lockspace bookkeeping and register the "dlm"
+ * kset.  Returns 0 or the error from kset_register().
+ */
+int dlm_lockspace_init(void)
+{
+	int rv;
+
+	INIT_LIST_HEAD(&lslist);
+	spin_lock_init(&lslist_lock);
+	mutex_init(&ls_lock);
+	ls_count = 0;
+
+	rv = kset_register(&dlm_kset);
+	if (rv)
+		printk("dlm_lockspace_init: cannot register kset %d\n", rv);
+
+	return rv;
+}
+
+/* Undo dlm_lockspace_init(): unregister the "dlm" kset. */
+void dlm_lockspace_exit(void)
+{
+ kset_unregister(&dlm_kset);
+}
+
+/*
+ * Body of the "dlm_scand" kernel thread: until asked to stop, call
+ * dlm_scan_rsbs() on every lockspace in lslist, then sleep (interruptibly)
+ * for dlm_config.scan_secs seconds.  The data argument is unused.  Always
+ * returns 0.
+ *
+ * NOTE(review): lslist is walked here without lslist_lock, while
+ * new_lockspace() and remove_lockspace() modify it under that lock --
+ * confirm this traversal cannot race with a lockspace being removed.
+ */
+static int dlm_scand(void *data)
+{
+ struct dlm_ls *ls;
+
+ while (!kthread_should_stop()) {
+ list_for_each_entry(ls, &lslist, ls_list)
+ dlm_scan_rsbs(ls);
+ schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
+ }
+ return 0;
+}
+
+/*
+ * Start the "dlm_scand" kernel thread and remember it in scand_task.
+ * Returns 0 on success or the error encoded in kthread_run()'s result;
+ * scand_task is only written on success.
+ */
+static int dlm_scand_start(void)
+{
+	struct task_struct *task;
+
+	task = kthread_run(dlm_scand, NULL, "dlm_scand");
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	scand_task = task;
+	return 0;
+}
+
+/* Stop the thread recorded in scand_task by dlm_scand_start(). */
+static void dlm_scand_stop(void)
+{
+ kthread_stop(scand_task);
+}
+
+/*
+ * Look up a lockspace by name (length and bytes must both match).  Returns
+ * the lockspace or NULL.  Unlike the other dlm_find_lockspace_*() lookups
+ * this does not increment ls_count.
+ */
+static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
+{
+	struct dlm_ls *pos, *match = NULL;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(pos, &lslist, ls_list) {
+		if (pos->ls_namelen != namelen)
+			continue;
+		if (memcmp(pos->ls_name, name, namelen) != 0)
+			continue;
+		match = pos;
+		break;
+	}
+	spin_unlock(&lslist_lock);
+
+	return match;
+}
+
+/*
+ * Look up a lockspace by its global id.  On a match ls_count is incremented
+ * under lslist_lock and the lockspace is returned; the caller drops that
+ * count with dlm_put_lockspace().  Returns NULL when nothing matches.
+ */
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
+{
+	struct dlm_ls *pos, *match = NULL;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(pos, &lslist, ls_list) {
+		if (pos->ls_global_id != id)
+			continue;
+		pos->ls_count++;
+		match = pos;
+		break;
+	}
+	spin_unlock(&lslist_lock);
+
+	return match;
+}
+
+/*
+ * Look up a lockspace by its local handle.  On a match ls_count is
+ * incremented under lslist_lock and the lockspace is returned; the caller
+ * drops that count with dlm_put_lockspace().  Returns NULL when nothing
+ * matches.
+ */
+struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
+{
+	struct dlm_ls *pos, *match = NULL;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(pos, &lslist, ls_list) {
+		if (pos->ls_local_handle != lockspace)
+			continue;
+		pos->ls_count++;
+		match = pos;
+		break;
+	}
+	spin_unlock(&lslist_lock);
+
+	return match;
+}
+
+/*
+ * Look up a lockspace by the minor number of its device.  On a match
+ * ls_count is incremented under lslist_lock and the lockspace is returned;
+ * the caller drops that count with dlm_put_lockspace().  Returns NULL when
+ * nothing matches.
+ */
+struct dlm_ls *dlm_find_lockspace_device(int minor)
+{
+	struct dlm_ls *pos, *match = NULL;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(pos, &lslist, ls_list) {
+		if (pos->ls_device.minor != minor)
+			continue;
+		pos->ls_count++;
+		match = pos;
+		break;
+	}
+	spin_unlock(&lslist_lock);
+
+	return match;
+}
+
+/*
+ * Drop a count taken by dlm_find_lockspace_global/local/device().
+ * remove_lockspace() waits for ls_count to reach zero before unlinking the
+ * lockspace from lslist.
+ */
+void dlm_put_lockspace(struct dlm_ls *ls)
+{
+ spin_lock(&lslist_lock);
+ ls->ls_count--;
+ spin_unlock(&lslist_lock);
+}
+
+/*
+ * Unlink the lockspace from lslist once nobody holds a count on it.  While
+ * ls_count is non-zero, sleep a second (with the lock dropped) and retry.
+ */
+static void remove_lockspace(struct dlm_ls *ls)
+{
+	int unlinked = 0;
+
+	while (!unlinked) {
+		spin_lock(&lslist_lock);
+		if (ls->ls_count == 0) {
+			list_del(&ls->ls_list);
+			unlinked = 1;
+		}
+		spin_unlock(&lslist_lock);
+
+		if (!unlinked)
+			ssleep(1);
+	}
+}
+
+/*
+ * Start the threads shared by all lockspaces: dlm_astd, dlm_scand and the
+ * lowcomms threads, in that order.  If one fails to start, the ones already
+ * started are stopped again and the error is returned; returns 0 when all
+ * are running.
+ */
+static int threads_start(void)
+{
+	int rv;
+
+	/* Thread which processes lock requests for all lockspaces */
+	rv = dlm_astd_start();
+	if (rv) {
+		log_print("cannot start dlm_astd thread %d", rv);
+		return rv;
+	}
+
+	rv = dlm_scand_start();
+	if (rv) {
+		log_print("cannot start dlm_scand thread %d", rv);
+		goto stop_astd;
+	}
+
+	/* Threads for sending/receiving messages for all lockspaces */
+	rv = dlm_lowcomms_start();
+	if (rv) {
+		log_print("cannot start dlm lowcomms %d", rv);
+		goto stop_scand;
+	}
+
+	return 0;
+
+ stop_scand:
+	dlm_scand_stop();
+ stop_astd:
+	dlm_astd_stop();
+	return rv;
+}
+
+/* Stop the shared threads started by threads_start(). */
+static void threads_stop(void)
+{
+ dlm_scand_stop();
+ dlm_lowcomms_stop();
+ dlm_astd_stop();
+}
+
+/*
+ * Allocate and initialise a new lockspace named by the first namelen bytes
+ * of name, add it to lslist, start its recovery thread, register its kobject
+ * and wait for the KOBJ_ONLINE uevent to be answered.
+ *
+ * On success *lockspace is set to the new lockspace and 0 is returned; the
+ * module reference taken here is kept (release_lockspace() drops it).
+ * If a lockspace with that name already exists, *lockspace is set to it and
+ * -EEXIST is returned.  -EINVAL is returned when namelen exceeds
+ * DLM_LOCKSPACE_LEN, when lvblen is zero or not a multiple of 8, or when a
+ * module reference cannot be taken.  Allocation failures return -ENOMEM;
+ * later failures return the error of the step that failed.  Every failure
+ * path unwinds the steps already done and drops the module reference.
+ *
+ * NOTE(review): a negative namelen is not rejected before it is used as an
+ * allocation size and memcpy length -- confirm callers never pass one.
+ */
+static int new_lockspace(char *name, int namelen, void **lockspace,
+ uint32_t flags, int lvblen)
+{
+ struct dlm_ls *ls;
+ int i, size, error = -ENOMEM;
+
+ if (namelen > DLM_LOCKSPACE_LEN)
+ return -EINVAL;
+
+ if (!lvblen || (lvblen % 8))
+ return -EINVAL;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+
+ ls = dlm_find_lockspace_name(name, namelen);
+ if (ls) {
+ *lockspace = ls;
+ module_put(THIS_MODULE);
+ return -EEXIST;
+ }
+
+ /* namelen extra bytes are allocated for the copy of name made below */
+ ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
+ if (!ls)
+ goto out;
+ memcpy(ls->ls_name, name, namelen);
+ ls->ls_namelen = namelen;
+ ls->ls_exflags = flags;
+ ls->ls_lvblen = lvblen;
+ ls->ls_count = 0;
+ ls->ls_flags = 0;
+
+ /* the three hash tables are sized from dlm_config */
+ size = dlm_config.rsbtbl_size;
+ ls->ls_rsbtbl_size = size;
+
+ ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
+ if (!ls->ls_rsbtbl)
+ goto out_lsfree;
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
+ INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
+ rwlock_init(&ls->ls_rsbtbl[i].lock);
+ }
+
+ size = dlm_config.lkbtbl_size;
+ ls->ls_lkbtbl_size = size;
+
+ ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
+ if (!ls->ls_lkbtbl)
+ goto out_rsbfree;
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
+ rwlock_init(&ls->ls_lkbtbl[i].lock);
+ ls->ls_lkbtbl[i].counter = 1;
+ }
+
+ size = dlm_config.dirtbl_size;
+ ls->ls_dirtbl_size = size;
+
+ ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
+ if (!ls->ls_dirtbl)
+ goto out_lkbfree;
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
+ rwlock_init(&ls->ls_dirtbl[i].lock);
+ }
+
+ INIT_LIST_HEAD(&ls->ls_waiters);
+ mutex_init(&ls->ls_waiters_mutex);
+
+ INIT_LIST_HEAD(&ls->ls_nodes);
+ INIT_LIST_HEAD(&ls->ls_nodes_gone);
+ ls->ls_num_nodes = 0;
+ ls->ls_low_nodeid = 0;
+ ls->ls_total_weight = 0;
+ ls->ls_node_array = NULL;
+
+ memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
+ ls->ls_stub_rsb.res_ls = ls;
+
+ ls->ls_debug_rsb_dentry = NULL;
+ ls->ls_debug_waiters_dentry = NULL;
+
+ init_waitqueue_head(&ls->ls_uevent_wait);
+ ls->ls_uevent_result = 0;
+
+ ls->ls_recoverd_task = NULL;
+ mutex_init(&ls->ls_recoverd_active);
+ spin_lock_init(&ls->ls_recover_lock);
+ ls->ls_recover_status = 0;
+ ls->ls_recover_seq = 0;
+ ls->ls_recover_args = NULL;
+ init_rwsem(&ls->ls_in_recovery);
+ INIT_LIST_HEAD(&ls->ls_requestqueue);
+ mutex_init(&ls->ls_requestqueue_mutex);
+ mutex_init(&ls->ls_clear_proc_locks);
+
+ ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+ if (!ls->ls_recover_buf)
+ goto out_dirfree;
+
+ INIT_LIST_HEAD(&ls->ls_recover_list);
+ spin_lock_init(&ls->ls_recover_list_lock);
+ ls->ls_recover_list_count = 0;
+ ls->ls_local_handle = ls;
+ init_waitqueue_head(&ls->ls_wait_general);
+ INIT_LIST_HEAD(&ls->ls_root_list);
+ init_rwsem(&ls->ls_root_sem);
+
+ /* the lockspace is created with ls_in_recovery held for write */
+ down_write(&ls->ls_in_recovery);
+
+ spin_lock(&lslist_lock);
+ list_add(&ls->ls_list, &lslist);
+ spin_unlock(&lslist_lock);
+
+ /* needs to find ls in lslist */
+ error = dlm_recoverd_start(ls);
+ if (error) {
+ log_error(ls, "can't start dlm_recoverd %d", error);
+ goto out_rcomfree;
+ }
+
+ /* the return value of dlm_create_debug_file() is not checked */
+ dlm_create_debug_file(ls);
+
+ error = kobject_setup(ls);
+ if (error)
+ goto out_del;
+
+ error = kobject_register(&ls->ls_kobj);
+ if (error)
+ goto out_del;
+
+ /* send KOBJ_ONLINE and wait for the answer written to event_done */
+ error = do_uevent(ls, 1);
+ if (error)
+ goto out_unreg;
+
+ *lockspace = ls;
+ return 0;
+
+ /* error unwinding: each label undoes one more of the steps above */
+ out_unreg:
+ kobject_unregister(&ls->ls_kobj);
+ out_del:
+ dlm_delete_debug_file(ls);
+ dlm_recoverd_stop(ls);
+ out_rcomfree:
+ spin_lock(&lslist_lock);
+ list_del(&ls->ls_list);
+ spin_unlock(&lslist_lock);
+ kfree(ls->ls_recover_buf);
+ out_dirfree:
+ kfree(ls->ls_dirtbl);
+ out_lkbfree:
+ kfree(ls->ls_lkbtbl);
+ out_rsbfree:
+ kfree(ls->ls_rsbtbl);
+ out_lsfree:
+ kfree(ls);
+ out:
+ module_put(THIS_MODULE);
+ return error;
+}
+
+/*
+ * Create a lockspace.  The first lockspace also starts the threads shared by
+ * all lockspaces (see threads_start()); release_lockspace() stops them again
+ * when the last lockspace goes away.
+ *
+ * name/namelen: lockspace name, not necessarily NUL-terminated
+ * lockspace:    set to the lockspace on success, or to the existing
+ *               lockspace when -EEXIST is returned
+ * flags:        stored as the lockspace's ls_exflags
+ * lvblen:       lock value block length; must be a non-zero multiple of 8
+ *
+ * Returns 0 on success or a negative errno from threads_start() or
+ * new_lockspace().  ls_lock serialises this against other creations and
+ * against the thread shutdown in release_lockspace().
+ */
+int dlm_new_lockspace(char *name, int namelen, void **lockspace,
+		      uint32_t flags, int lvblen)
+{
+	int error = 0;
+
+	mutex_lock(&ls_lock);
+	if (!ls_count)
+		error = threads_start();
+	if (error)
+		goto out;
+
+	error = new_lockspace(name, namelen, lockspace, flags, lvblen);
+	if (!error)
+		ls_count++;
+	else if (!ls_count)
+		/* We started the shared threads for what would have been the
+		   first lockspace, but it could not be created.  Stop them
+		   again, otherwise they are left running with no lockspace
+		   and the next call (ls_count still 0) would start a second
+		   set on top of them. */
+		threads_stop();
+ out:
+	mutex_unlock(&ls_lock);
+	return error;
+}
+
+/* Return 1 if the lockspace still has active remote locks,
+ * 2 if the lockspace still has active local locks.
+ */
+static int lockspace_busy(struct dlm_ls *ls)
+{
+	struct dlm_lkbtable *bucket;
+	struct dlm_lkb *lkb;
+	int i, any_lkb = 0;
+
+	/* NOTE: We check the lockidtbl here rather than the resource table.
+	   This is because there may be LKBs queued as ASTs that have been
+	   unlinked from their RSBs and are pending deletion once the AST has
+	   been delivered */
+
+	for (i = 0; i < ls->ls_lkbtbl_size; i++) {
+		int local_lkb = 0;
+
+		bucket = &ls->ls_lkbtbl[i];
+
+		read_lock(&bucket->lock);
+		list_for_each_entry(lkb, &bucket->list, lkb_idtbl_list) {
+			/* reaching here at all means the bucket is not empty */
+			any_lkb = 1;
+
+			/* an lkb with nodeid 0 counts as a local lock */
+			if (lkb->lkb_nodeid == 0) {
+				local_lkb = 1;
+				break;
+			}
+		}
+		read_unlock(&bucket->lock);
+
+		if (local_lkb)
+			return 2;
+	}
+
+	return any_lkb;
+}
+
+/*
+ * Tear down a lockspace and free everything hanging off it.
+ *
+ * Returns -EBUSY, without changing anything, when lockspace_busy() reports a
+ * level greater than force; otherwise tears the lockspace down and returns 0.
+ * For force < 3 a KOBJ_OFFLINE uevent is sent first and its answer awaited
+ * (the result is not checked).  When the last lockspace is released the
+ * shared threads are stopped.  The module reference kept by new_lockspace()
+ * is dropped at the end.
+ *
+ * NOTE(review): ls is kfree'd immediately after kobject_unregister() --
+ * confirm nothing can still hold a reference to the embedded ls_kobj then.
+ */
+static int release_lockspace(struct dlm_ls *ls, int force)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *rsb;
+ struct list_head *head;
+ int i;
+ int busy = lockspace_busy(ls);
+
+ if (busy > force)
+ return -EBUSY;
+
+ if (force < 3)
+ do_uevent(ls, 0);
+
+ dlm_recoverd_stop(ls);
+
+ /* waits until ls_count is zero, then unlinks ls from lslist */
+ remove_lockspace(ls);
+
+ dlm_delete_debug_file(ls);
+
+ /* astd stays suspended while the lkbs are freed below */
+ dlm_astd_suspend();
+
+ kfree(ls->ls_recover_buf);
+
+ /*
+ * Free direntry structs.
+ */
+
+ dlm_dir_clear(ls);
+ kfree(ls->ls_dirtbl);
+
+ /*
+ * Free all lkb's on lkbtbl[] lists.
+ */
+
+ for (i = 0; i < ls->ls_lkbtbl_size; i++) {
+ head = &ls->ls_lkbtbl[i].list;
+ while (!list_empty(head)) {
+ lkb = list_entry(head->next, struct dlm_lkb,
+ lkb_idtbl_list);
+
+ list_del(&lkb->lkb_idtbl_list);
+
+ dlm_del_ast(lkb);
+
+ /* the lvb is only freed here for master-copy lkbs */
+ if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
+ free_lvb(lkb->lkb_lvbptr);
+
+ free_lkb(lkb);
+ }
+ }
+ dlm_astd_resume();
+
+ kfree(ls->ls_lkbtbl);
+
+ /*
+ * Free all rsb's on rsbtbl[] lists
+ */
+
+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+ head = &ls->ls_rsbtbl[i].list;
+ while (!list_empty(head)) {
+ rsb = list_entry(head->next, struct dlm_rsb,
+ res_hashchain);
+
+ list_del(&rsb->res_hashchain);
+ free_rsb(rsb);
+ }
+
+ head = &ls->ls_rsbtbl[i].toss;
+ while (!list_empty(head)) {
+ rsb = list_entry(head->next, struct dlm_rsb,
+ res_hashchain);
+ list_del(&rsb->res_hashchain);
+ free_rsb(rsb);
+ }
+ }
+
+ kfree(ls->ls_rsbtbl);
+
+ /*
+ * Free structures on any other lists
+ */
+
+ kfree(ls->ls_recover_args);
+ dlm_clear_free_entries(ls);
+ dlm_clear_members(ls);
+ dlm_clear_members_gone(ls);
+ kfree(ls->ls_node_array);
+ kobject_unregister(&ls->ls_kobj);
+ kfree(ls);
+
+ /* ls must not be touched past this point */
+ mutex_lock(&ls_lock);
+ ls_count--;
+ if (!ls_count)
+ threads_stop();
+ mutex_unlock(&ls_lock);
+
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+/*
+ * Called when a system has released all its locks and is not going to use the
+ * lockspace any longer. We free everything we're managing for this lockspace.
+ * Remaining nodes will go through the recovery process as if we'd died. The
+ * lockspace must continue to function as usual, participating in recoveries,
+ * until this returns.
+ *
+ * Force has 4 possible values:
+ * 0 - don't destroy lockspace if it has any LKBs
+ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
+ * 2 - destroy lockspace regardless of LKBs
+ * 3 - destroy lockspace as part of a forced shutdown
+ */
+
+int dlm_release_lockspace(void *lockspace, int force)
+{
+	struct dlm_ls *ls = dlm_find_lockspace_local(lockspace);
+
+	if (ls == NULL)
+		return -EINVAL;
+
+	/* the lookup was only to validate the handle; give back the count
+	   it took before tearing the lockspace down */
+	dlm_put_lockspace(ls);
+
+	return release_lockspace(ls, force);
+}
+
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..891eabbdd021
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,25 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCKSPACE_DOT_H__
+#define __LOCKSPACE_DOT_H__
+
+/* module init/exit hooks: register/unregister the "dlm" kset */
+int dlm_lockspace_init(void);
+void dlm_lockspace_exit(void);
+/* lookups: each returns NULL or a lockspace whose ls_count was incremented;
+   release it with dlm_put_lockspace() */
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
+struct dlm_ls *dlm_find_lockspace_local(void *id);
+struct dlm_ls *dlm_find_lockspace_device(int minor);
+void dlm_put_lockspace(struct dlm_ls *ls);
+
+#endif /* __LOCKSPACE_DOT_H__ */
+
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..23f5ce12080b
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1238 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * lowcomms.c
+ *
+ * This is the "low-level" comms layer.
+ *
+ * It is responsible for sending/receiving messages
+ * from other nodes in the cluster.
+ *
+ * Cluster nodes are referred to by their nodeids. nodeids are
+ * simply 32 bit numbers to the locking module - if they need to
+ * be expanded for the cluster infrastructure then that is its
+ * responsibility. It is this layer's
+ * responsibility to resolve these into IP address or
+ * whatever it needs for inter-node communication.
+ *
+ * The comms level is two kernel threads that deal mainly with
+ * the receiving of messages from other nodes and passing them
+ * up to the mid-level comms layer (which understands the
+ * message format) for execution by the locking core, and
+ * a send thread which does all the setting up of connections
+ * to remote nodes and the sending of data. Threads are not allowed
+ * to send their own data because it may cause them to wait in times
+ * of high load. Also, this way, the sending thread can collect together
+ * messages bound for one node and send them in one block.
+ *
+ * I don't see any problem with the recv thread executing the locking
+ * code on behalf of remote processes as the locking code is
+ * short, efficient and never (well, hardly ever) waits.
+ *
+ */
+
+#include <asm/ioctls.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/sctp/user.h>
+#include <linux/pagemap.h>
+#include <linux/socket.h>
+#include <linux/idr.h>
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "midcomms.h"
+
+static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static int dlm_local_count;
+static int dlm_local_nodeid;
+
+/* One of these per connected node */
+
+#define NI_INIT_PENDING 1
+#define NI_WRITE_PENDING 2
+
+struct nodeinfo {
+ spinlock_t lock;
+ sctp_assoc_t assoc_id;
+ unsigned long flags;
+ struct list_head write_list; /* nodes with pending writes */
+ struct list_head writequeue; /* outgoing writequeue_entries */
+ spinlock_t writequeue_lock;
+ int nodeid;
+};
+
+static DEFINE_IDR(nodeinfo_idr);
+static struct rw_semaphore nodeinfo_lock;
+static int max_nodeid;
+
+struct cbuf {
+ unsigned base;
+ unsigned len;
+ unsigned mask;
+};
+
+/* Just the one of these, now. But this struct keeps
+ the connection-specific variables together */
+
+#define CF_READ_PENDING 1
+
+struct connection {
+ struct socket *sock;
+ unsigned long flags;
+ struct page *rx_page;
+ atomic_t waiting_requests;
+ struct cbuf cb;
+ int eagain_flag;
+};
+
+/* An entry waiting to be sent */
+
+struct writequeue_entry {
+ struct list_head list;
+ struct page *page;
+ int offset;
+ int len;
+ int end;
+ int users;
+ struct nodeinfo *ni;
+};
+
+/*
+ * Helpers for struct cbuf, which tracks a region inside a buffer by start
+ * offset (base), byte count (len) and a wrap mask.  Offsets wrap by being
+ * ANDed with mask, which CBUF_INIT sets to size-1 -- this assumes size is a
+ * power of two (TODO confirm for every caller).
+ *
+ * CBUF_ADD      account for n more bytes of data
+ * CBUF_EMPTY    true when no data is held
+ * CBUF_MAY_ADD  true when len + n stays below mask + 1
+ * CBUF_DATA     offset just past the last byte of data (base + len, wrapped)
+ */
+#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
+#define CBUF_EMPTY(cb) ((cb)->len == 0)
+#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
+#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
+
+/* reset to empty and set the wrap mask for a buffer of size bytes */
+#define CBUF_INIT(cb, size) \
+do { \
+ (cb)->base = (cb)->len = 0; \
+ (cb)->mask = ((size)-1); \
+} while(0)
+
+/* consume n bytes from the front: shrink len and advance base with wrap */
+#define CBUF_EAT(cb, n) \
+do { \
+ (cb)->len -= (n); \
+ (cb)->base += (n); \
+ (cb)->base &= (cb)->mask; \
+} while(0)
+
+
+/* List of nodes which have writes pending */
+static struct list_head write_nodes;
+static spinlock_t write_nodes_lock;
+
+/* Maximum number of incoming messages to process before
+ * doing a schedule()
+ */
+#define MAX_RX_MSG_COUNT 25
+
+/* Manage daemons */
+static struct task_struct *recv_task;
+static struct task_struct *send_task;
+static wait_queue_head_t lowcomms_recv_wait;
+static atomic_t accepting;
+
+/* The SCTP connection */
+static struct connection sctp_con;
+
+
+/*
+ * Copy the IP address of node nodeid into retaddr.  Only the address field
+ * is written (sin_addr or sin6_addr, chosen by the family of our first local
+ * address); family and port in retaddr are left alone.  Returns -1 when no
+ * local address is configured, the error from dlm_nodeid_to_addr() if the
+ * lookup fails, and 0 on success.
+ */
+static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
+{
+	struct sockaddr_storage found;
+	int rv;
+
+	if (!dlm_local_count)
+		return -1;
+
+	rv = dlm_nodeid_to_addr(nodeid, &found);
+	if (rv)
+		return rv;
+
+	if (dlm_local_addr[0]->ss_family != AF_INET) {
+		struct sockaddr_in6 *src6 = (struct sockaddr_in6 *) &found;
+		struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *) retaddr;
+
+		memcpy(&dst6->sin6_addr, &src6->sin6_addr,
+		       sizeof(src6->sin6_addr));
+	} else {
+		struct sockaddr_in *src4 = (struct sockaddr_in *) &found;
+		struct sockaddr_in *dst4 = (struct sockaddr_in *) retaddr;
+
+		dst4->sin_addr.s_addr = src4->sin_addr.s_addr;
+	}
+
+	return 0;
+}
+
+/*
+ * Return the nodeinfo registered for nodeid in nodeinfo_idr.
+ *
+ * If there is none and alloc is non-zero, alloc is used as the allocation
+ * mask (it is passed to idr_pre_get() and kmalloc()) to create, register and
+ * initialise a new one.  Pass 0 for a pure lookup.  Returns NULL when the
+ * node is unknown and alloc is 0, or when creating the entry fails.
+ */
+static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
+{
+ struct nodeinfo *ni;
+ int r;
+ int n;
+
+ down_read(&nodeinfo_lock);
+ ni = idr_find(&nodeinfo_idr, nodeid);
+ up_read(&nodeinfo_lock);
+
+ if (!ni && alloc) {
+ down_write(&nodeinfo_lock);
+
+ /* look again: the entry may have been added while the lock
+ was dropped between the read and write sections */
+ ni = idr_find(&nodeinfo_idr, nodeid);
+ if (ni)
+ goto out_up;
+
+ r = idr_pre_get(&nodeinfo_idr, alloc);
+ if (!r)
+ goto out_up;
+
+ ni = kmalloc(sizeof(struct nodeinfo), alloc);
+ if (!ni)
+ goto out_up;
+
+ r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
+ if (r) {
+ kfree(ni);
+ ni = NULL;
+ goto out_up;
+ }
+ /* only an id exactly equal to nodeid is acceptable; give back
+ anything else */
+ if (n != nodeid) {
+ idr_remove(&nodeinfo_idr, n);
+ kfree(ni);
+ ni = NULL;
+ goto out_up;
+ }
+ memset(ni, 0, sizeof(struct nodeinfo));
+ spin_lock_init(&ni->lock);
+ INIT_LIST_HEAD(&ni->writequeue);
+ spin_lock_init(&ni->writequeue_lock);
+ ni->nodeid = nodeid;
+
+ /* max_nodeid bounds the scans in assoc2nodeinfo() and
+ init_failed() */
+ if (nodeid > max_nodeid)
+ max_nodeid = nodeid;
+ out_up:
+ up_write(&nodeinfo_lock);
+ }
+
+ return ni;
+}
+
+/* Don't call this too often... */
+/*
+ * Find the nodeinfo whose assoc_id equals assoc by trying every nodeid from
+ * 1 to max_nodeid (a linear scan with one idr lookup per id).  Returns NULL
+ * when no node has that association.
+ */
+static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
+{
+	struct nodeinfo *ni;
+	int nodeid;
+
+	for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
+		ni = nodeid2nodeinfo(nodeid, 0);
+		if (!ni)
+			continue;
+		if (ni->assoc_id == assoc)
+			return ni;
+	}
+
+	return NULL;
+}
+
+/* Data or notification available on socket */
+/*
+ * Socket data-ready callback: count the request and, unless a read was
+ * already flagged as pending, flag it and wake whoever sleeps on
+ * lowcomms_recv_wait.  Neither argument is used.
+ */
+static void lowcomms_data_ready(struct sock *sk, int count_unused)
+{
+	atomic_inc(&sctp_con.waiting_requests);
+
+	if (!test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
+		wake_up_interruptible(&lowcomms_recv_wait);
+}
+
+
+/* Add the port number to an IP6 or 4 sockaddr and return the address length.
+ Also pad out the struct with zeros to make comparisons meaningful */
+
+static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
+			  int *addr_len)
+{
+	struct sockaddr_storage *local;
+	int is_ipv4;
+
+	/* nothing is written, not even *addr_len, without a local address */
+	if (!dlm_local_count)
+		return;
+
+	local = dlm_local_addr[0];
+	is_ipv4 = (local->ss_family == AF_INET);
+
+	/* port 0 selects the port of our first local address */
+	if (!port) {
+		if (is_ipv4)
+			port = be16_to_cpu(
+				((struct sockaddr_in *)local)->sin_port);
+		else
+			port = be16_to_cpu(
+				((struct sockaddr_in6 *)local)->sin6_port);
+	}
+
+	saddr->ss_family = local->ss_family;
+
+	if (is_ipv4) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)saddr;
+
+		sin->sin_port = cpu_to_be16(port);
+		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+		/* clear the part of the storage beyond the sockaddr_in */
+		memset(sin+1, 0, sizeof(struct sockaddr_storage) -
+		       sizeof(struct sockaddr_in));
+		*addr_len = sizeof(struct sockaddr_in);
+	} else {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)saddr;
+
+		sin6->sin6_port = cpu_to_be16(port);
+		/* clear the part of the storage beyond the sockaddr_in6 */
+		memset(sin6+1, 0, sizeof(struct sockaddr_storage) -
+		       sizeof(struct sockaddr_in6));
+		*addr_len = sizeof(struct sockaddr_in6);
+	}
+}
+
+/* Close the connection and tidy up */
+static void close_connection(void)
+{
+ /* both pointers are reset to NULL after release, so calling this
+ again is harmless */
+ if (sctp_con.sock) {
+ sock_release(sctp_con.sock);
+ sctp_con.sock = NULL;
+ }
+
+ if (sctp_con.rx_page) {
+ __free_page(sctp_con.rx_page);
+ sctp_con.rx_page = NULL;
+ }
+}
+
+/* We only send shutdown messages to nodes that are not part of the cluster */
+static void send_shutdown(sctp_assoc_t associd)
+{
+ /* NOTE(review): static buffer, so this function is not reentrant --
+ confirm it is only ever called from one thread */
+ static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+ struct msghdr outmessage;
+ struct cmsghdr *cmsg;
+ struct sctp_sndrcvinfo *sinfo;
+ int ret;
+
+ outmessage.msg_name = NULL;
+ outmessage.msg_namelen = 0;
+ outmessage.msg_control = outcmsg;
+ outmessage.msg_controllen = sizeof(outcmsg);
+ outmessage.msg_flags = MSG_EOR;
+
+ /* a single SCTP_SNDRCV control message carries the request */
+ cmsg = CMSG_FIRSTHDR(&outmessage);
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDRCV;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+ outmessage.msg_controllen = cmsg->cmsg_len;
+ sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+
+ /* MSG_EOF on the given association, sent with no payload below */
+ sinfo->sinfo_flags |= MSG_EOF;
+ sinfo->sinfo_assoc_id = associd;
+
+ ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
+
+ /* failure is only logged; there is nothing to undo */
+ if (ret != 0)
+ log_print("send EOF to node failed: %d", ret);
+}
+
+
+/* INIT failed but we don't know which node...
+ restart INIT on all pending nodes */
+static void init_failed(void)
+{
+	struct nodeinfo *ni;
+	int nodeid;
+
+	for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
+		ni = nodeid2nodeinfo(nodeid, 0);
+		if (!ni)
+			continue;
+
+		/* only nodes with an INIT outstanding are touched */
+		if (!test_and_clear_bit(NI_INIT_PENDING, &ni->flags))
+			continue;
+
+		ni->assoc_id = 0;
+
+		/* already queued for writing: nothing more to do */
+		if (test_and_set_bit(NI_WRITE_PENDING, &ni->flags))
+			continue;
+
+		spin_lock_bh(&write_nodes_lock);
+		list_add_tail(&ni->write_list, &write_nodes);
+		spin_unlock_bh(&write_nodes_lock);
+	}
+
+	wake_up_process(send_task);
+}
+
+/* Something happened to an association */
+/*
+ * Handle an SCTP notification held in buf.  Only SCTP_ASSOC_CHANGE is acted
+ * on; every other notification type is silently ignored.  The msg argument
+ * is not used.
+ *
+ *  COMM_UP / RESTART:        look up the peer's primary address, map it to a
+ *                            nodeid, record the association id in that node's
+ *                            nodeinfo and queue the node for sending.
+ *                            Unknown peers are sent a shutdown.
+ *  COMM_LOST / SHUTDOWN_COMP: forget the node's association id.
+ *  CANT_STR_ASSOC:           retry INIT on all pending nodes.
+ */
+static void process_sctp_notification(struct msghdr *msg, char *buf)
+{
+ union sctp_notification *sn = (union sctp_notification *)buf;
+
+ if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+ switch (sn->sn_assoc_change.sac_state) {
+
+ case SCTP_COMM_UP:
+ case SCTP_RESTART:
+ {
+ /* Check that the new node is in the lockspace */
+ struct sctp_prim prim;
+ mm_segment_t fs;
+ int nodeid;
+ int prim_len, ret;
+ int addr_len;
+ struct nodeinfo *ni;
+
+ /* This seems to happen when we received a connection
+ * too early... or something... anyway, it happens but
+ * we always seem to get a real message too, see
+ * receive_from_sock */
+
+ if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
+ log_print("COMM_UP for invalid assoc ID %d",
+ (int)sn->sn_assoc_change.sac_assoc_id);
+ init_failed();
+ return;
+ }
+ memset(&prim, 0, sizeof(struct sctp_prim));
+ prim_len = sizeof(struct sctp_prim);
+ prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
+
+ /* getsockopt is handed kernel pointers here, so the address
+ limit is switched for the duration of the call */
+ fs = get_fs();
+ set_fs(get_ds());
+ ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
+ IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
+ (char*)&prim, &prim_len);
+ set_fs(fs);
+ if (ret < 0) {
+ /* NOTE(review): this declaration shadows the ni declared
+ at the top of this case block */
+ struct nodeinfo *ni;
+
+ log_print("getsockopt/sctp_primary_addr on "
+ "new assoc %d failed : %d",
+ (int)sn->sn_assoc_change.sac_assoc_id, ret);
+
+ /* Retry INIT later */
+ ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+ if (ni)
+ clear_bit(NI_INIT_PENDING, &ni->flags);
+ return;
+ }
+ /* with port 0 this fills in our own local port and zero-pads
+ the address before it is looked up; addr_len is not used
+ afterwards */
+ make_sockaddr(&prim.ssp_addr, 0, &addr_len);
+ if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+ log_print("reject connect from unknown addr");
+ send_shutdown(prim.ssp_assoc_id);
+ return;
+ }
+
+ /* creates the nodeinfo if this is the first contact */
+ ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+ if (!ni)
+ return;
+
+ /* Save the assoc ID */
+ spin_lock(&ni->lock);
+ ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
+ spin_unlock(&ni->lock);
+
+ log_print("got new/restarted association %d nodeid %d",
+ (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+
+ /* Send any pending writes */
+ clear_bit(NI_INIT_PENDING, &ni->flags);
+ if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+ spin_lock_bh(&write_nodes_lock);
+ list_add_tail(&ni->write_list, &write_nodes);
+ spin_unlock_bh(&write_nodes_lock);
+ }
+ wake_up_process(send_task);
+ }
+ break;
+
+ case SCTP_COMM_LOST:
+ case SCTP_SHUTDOWN_COMP:
+ {
+ struct nodeinfo *ni;
+
+ /* an association we have no node for is ignored */
+ ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+ if (ni) {
+ spin_lock(&ni->lock);
+ ni->assoc_id = 0;
+ spin_unlock(&ni->lock);
+ }
+ }
+ break;
+
+ /* We don't know which INIT failed, so clear the PENDING flags
+ * on them all. if assoc_id is zero then it will then try
+ * again */
+
+ case SCTP_CANT_STR_ASSOC:
+ {
+ log_print("Can't start SCTP association - retrying");
+ init_failed();
+ }
+ break;
+
+ default:
+ log_print("unexpected SCTP assoc change id=%d state=%d",
+ (int)sn->sn_assoc_change.sac_assoc_id,
+ sn->sn_assoc_change.sac_state);
+ }
+ }
+}
+
+/*
+ * Data received from remote end.
+ *
+ * Reads one message from the shared SCTP socket into the circular receive
+ * buffer.  SCTP notifications are handed to process_sctp_notification();
+ * data messages first update the sending node's nodeinfo (the sender puts
+ * its nodeid in sinfo_ppid) and are then passed to
+ * dlm_process_incoming_buffer().
+ *
+ * Returns 0 when a message was handled or there was nothing to do,
+ * otherwise the non-positive result of kernel_recvmsg() or the negative
+ * result of dlm_process_incoming_buffer().
+ */
+static int receive_from_sock(void)
+{
+	int ret = 0;
+	struct msghdr msg;
+	struct kvec iov[2];
+	unsigned len;
+	int r;
+	int nvec = 1;
+	struct sctp_sndrcvinfo *sinfo;
+	struct cmsghdr *cmsg;
+	struct nodeinfo *ni;
+
+	/* These two are marginally too big for stack allocation, but this
+	 * function is (currently) only called by dlm_recvd so static should be
+	 * OK.
+	 */
+	static struct sockaddr_storage msgname;
+	static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+
+	if (sctp_con.sock == NULL)
+		goto out;
+
+	if (sctp_con.rx_page == NULL) {
+		/*
+		 * This doesn't need to be atomic, but I think it should
+		 * improve performance if it is.
+		 */
+		sctp_con.rx_page = alloc_page(GFP_ATOMIC);
+		if (sctp_con.rx_page == NULL)
+			goto out_resched;
+		CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
+	}
+
+	memset(incmsg, 0, sizeof(incmsg));
+	memset(&msgname, 0, sizeof(msgname));
+
+	msg.msg_name = &msgname;
+	msg.msg_namelen = sizeof(msgname);
+	msg.msg_flags = 0;
+	msg.msg_control = incmsg;
+	msg.msg_controllen = sizeof(incmsg);
+
+	/* I don't see why this circular buffer stuff is necessary for SCTP
+	 * which is a packet-based protocol, but the whole thing breaks under
+	 * load without it! The overhead is minimal (and is in the TCP lowcomms
+	 * anyway, of course) so I'll leave it in until I can figure out what's
+	 * really happening.
+	 */
+
+	/*
+	 * iov[0] is the bit of the circular buffer between the current end
+	 * point (cb.base + cb.len) and the end of the buffer.
+	 */
+	iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
+	iov[0].iov_base = page_address(sctp_con.rx_page) +
+			  CBUF_DATA(&sctp_con.cb);
+	iov[1].iov_len = 0;
+
+	/*
+	 * iov[1] is the bit of the circular buffer between the start of the
+	 * buffer and the start of the currently used section (cb.base)
+	 */
+	if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
+		iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
+		iov[1].iov_len = sctp_con.cb.base;
+		iov[1].iov_base = page_address(sctp_con.rx_page);
+		nvec = 2;
+	}
+	len = iov[0].iov_len + iov[1].iov_len;
+
+	/*
+	 * Pass the number of iovecs actually set up.  len covers both
+	 * segments, so always passing 1 here would advertise more room than
+	 * the single vector really has whenever the buffer has wrapped.
+	 */
+	r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, nvec, len,
+				 MSG_NOSIGNAL | MSG_DONTWAIT);
+	if (ret <= 0)
+		goto out_close;
+
+	/* Point back at the control buffer to read the sndrcvinfo */
+	msg.msg_control = incmsg;
+	msg.msg_controllen = sizeof(incmsg);
+	cmsg = CMSG_FIRSTHDR(&msg);
+	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+
+	if (msg.msg_flags & MSG_NOTIFICATION) {
+		process_sctp_notification(&msg, page_address(sctp_con.rx_page));
+		return 0;
+	}
+
+	/* Is this a new association ? */
+	ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
+	if (ni) {
+		ni->assoc_id = sinfo->sinfo_assoc_id;
+		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
+
+			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+				spin_lock_bh(&write_nodes_lock);
+				list_add_tail(&ni->write_list, &write_nodes);
+				spin_unlock_bh(&write_nodes_lock);
+			}
+			wake_up_process(send_task);
+		}
+	}
+
+	/* INIT sends a message with length of 1 - ignore it */
+	if (r == 1)
+		return 0;
+
+	CBUF_ADD(&sctp_con.cb, ret);
+	/* sinfo_ppid is little-endian on the wire (see the send side) */
+	ret = dlm_process_incoming_buffer(le32_to_cpu(sinfo->sinfo_ppid),
+					  page_address(sctp_con.rx_page),
+					  sctp_con.cb.base, sctp_con.cb.len,
+					  PAGE_CACHE_SIZE);
+	if (ret < 0)
+		goto out_close;
+	CBUF_EAT(&sctp_con.cb, ret);
+
+ out:
+	ret = 0;
+	goto out_ret;
+
+ out_resched:
+	/* No receive page available: re-flag the read and yield */
+	lowcomms_data_ready(sctp_con.sock->sk, 0);
+	ret = 0;
+	schedule();
+	goto out_ret;
+
+ out_close:
+	if (ret != -EAGAIN)
+		log_print("error reading from sctp socket: %d", ret);
+ out_ret:
+	return ret;
+}
+
+/*
+ * Bind the SCTP socket to one local address.  SCTP allows multiple addresses
+ * so it can do multi-homing: the first address (num == 1) goes through a
+ * plain bind(), every further one is added with SCTP_SOCKOPT_BINDX_ADD.
+ * Returns the result of the bind/setsockopt call.
+ */
+static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
+{
+	mm_segment_t old_fs = get_fs();
+	int rv;
+
+	/* The socket ops expect user pointers; widen the address limit */
+	set_fs(get_ds());
+	if (num != 1)
+		rv = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
+						    SCTP_SOCKOPT_BINDX_ADD,
+						    (char *)addr, addr_len);
+	else
+		rv = sctp_con.sock->ops->bind(sctp_con.sock,
+					      (struct sockaddr *) addr,
+					      addr_len);
+	set_fs(old_fs);
+
+	if (rv < 0)
+		log_print("Can't bind to port %d addr number %d",
+			  dlm_config.tcp_port, num);
+
+	return rv;
+}
+
+/*
+ * Record our own nodeid and take private copies of the local addresses
+ * returned by dlm_our_addr().  Stops at the first index for which
+ * dlm_our_addr() returns non-zero, or when an allocation fails.
+ */
+static void init_local(void)
+{
+	struct sockaddr_storage sas;
+	int idx = 0;
+
+	dlm_local_nodeid = dlm_our_nodeid();
+
+	while (idx < DLM_MAX_ADDR_COUNT - 1 && !dlm_our_addr(&sas, idx)) {
+		struct sockaddr_storage *copy;
+
+		copy = kmalloc(sizeof(*copy), GFP_KERNEL);
+		if (!copy)
+			break;
+		memcpy(copy, &sas, sizeof(*copy));
+		dlm_local_addr[dlm_local_count++] = copy;
+		idx++;
+	}
+}
+
+/*
+ * Initialise SCTP socket and bind to all interfaces.
+ *
+ * Creates the socket, subscribes to the SCTP events we handle, hooks up the
+ * data_ready callback, binds every known local address and starts
+ * listening.  Returns 0 on success; otherwise the error from the failing
+ * step (-EINVAL when no local address is configured).  A socket that was
+ * created is released again on failure.
+ */
+static int init_sock(void)
+{
+	struct sctp_event_subscribe events;
+	struct sockaddr_storage bind_addr;
+	struct socket *sock = NULL;
+	mm_segment_t old_fs;
+	int rv = -EINVAL;
+	int len, idx;
+
+	if (!dlm_local_count)
+		init_local();
+	if (!dlm_local_count) {
+		log_print("no local IP address has been set");
+		goto out;
+	}
+
+	rv = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
+			      IPPROTO_SCTP, &sock);
+	if (rv < 0) {
+		log_print("Can't create comms socket, check SCTP is loaded");
+		goto out;
+	}
+
+	/* Listen for events */
+	memset(&events, 0, sizeof(events));
+	events.sctp_data_io_event = 1;
+	events.sctp_association_event = 1;
+	events.sctp_send_failure_event = 1;
+	events.sctp_shutdown_event = 1;
+	events.sctp_partial_delivery_event = 1;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	rv = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
+				   (char *)&events, sizeof(events));
+	set_fs(old_fs);
+
+	if (rv < 0) {
+		log_print("Failed to set SCTP_EVENTS on socket: result=%d",
+			  rv);
+		goto release;
+	}
+
+	/* Init con struct */
+	sock->sk->sk_user_data = &sctp_con;
+	sctp_con.sock = sock;
+	sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
+
+	/* Bind to all interfaces; addresses are numbered from 1 */
+	for (idx = 0; idx < dlm_local_count; idx++) {
+		memcpy(&bind_addr, dlm_local_addr[idx], sizeof(bind_addr));
+		make_sockaddr(&bind_addr, dlm_config.tcp_port, &len);
+
+		rv = add_bind_addr(&bind_addr, len, idx + 1);
+		if (rv)
+			goto release;
+	}
+
+	rv = sock->ops->listen(sock, 5);
+	if (rv >= 0)
+		return 0;
+	log_print("Can't set socket listening");
+
+ release:
+	sock_release(sock);
+	sctp_con.sock = NULL;
+ out:
+	return rv;
+}
+
+
+/*
+ * Allocate a write-queue entry together with its backing page.
+ * The counters start at zero; the caller fills in ->ni and queues the entry.
+ * Returns NULL if either allocation fails.
+ */
+static struct writequeue_entry *new_writequeue_entry(int allocation)
+{
+	struct writequeue_entry *wq;
+
+	wq = kmalloc(sizeof(struct writequeue_entry), allocation);
+	if (wq == NULL)
+		return NULL;
+
+	wq->page = alloc_page(allocation);
+	if (wq->page == NULL) {
+		kfree(wq);
+		return NULL;
+	}
+
+	wq->users = 0;
+	wq->end = 0;
+	wq->len = 0;
+	wq->offset = 0;
+
+	return wq;
+}
+
+/*
+ * Reserve len bytes of outgoing buffer space for nodeid.
+ *
+ * The space comes from the last entry on the node's write queue when it
+ * still has room, otherwise from a freshly allocated entry appended to the
+ * queue.  *ppc is set to the start of the reserved space.  Returns the
+ * queue entry as an opaque handle for dlm_lowcomms_commit_buffer(), or NULL
+ * when lowcomms is not accepting, the node is unknown or allocation fails.
+ */
+void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
+{
+	struct writequeue_entry *e;
+	struct nodeinfo *ni;
+	int offset = 0;
+	int users = 0;
+
+	if (!atomic_read(&accepting))
+		return NULL;
+
+	ni = nodeid2nodeinfo(nodeid, allocation);
+	if (!ni)
+		return NULL;
+
+	/* Try to append to the entry at the tail of the queue */
+	spin_lock(&ni->writequeue_lock);
+	e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
+	if (((struct list_head *) e == &ni->writequeue) ||
+	    (PAGE_CACHE_SIZE - e->end < len)) {
+		e = NULL;
+	} else {
+		offset = e->end;
+		e->end += len;
+		users = e->users++;
+	}
+	spin_unlock(&ni->writequeue_lock);
+
+	if (!e) {
+		/* Queue empty or tail entry full: start a new entry */
+		e = new_writequeue_entry(allocation);
+		if (!e)
+			return NULL;
+
+		spin_lock(&ni->writequeue_lock);
+		offset = e->end;
+		e->end += len;
+		e->ni = ni;
+		users = e->users++;
+		list_add_tail(&e->list, &ni->writequeue);
+		spin_unlock(&ni->writequeue_lock);
+	}
+
+	/* The first user maps the page; the last committer unmaps it */
+	if (users == 0)
+		kmap(e->page);
+	*ppc = page_address(e->page) + offset;
+	return e;
+}
+
+/*
+ * Release a reservation made with dlm_lowcomms_get_buffer().
+ *
+ * arg is the handle returned by dlm_lowcomms_get_buffer().  When the last
+ * user of the entry commits, the entry's length is fixed, the page is
+ * unmapped and the node is queued for the send thread.
+ */
+void dlm_lowcomms_commit_buffer(void *arg)
+{
+	struct writequeue_entry *e = (struct writequeue_entry *) arg;
+	struct nodeinfo *ni = e->ni;
+	int remaining;
+
+	if (!atomic_read(&accepting))
+		return;
+
+	spin_lock(&ni->writequeue_lock);
+	remaining = --e->users;
+	if (remaining) {
+		/* Somebody else is still filling this entry */
+		spin_unlock(&ni->writequeue_lock);
+		return;
+	}
+	e->len = e->end - e->offset;
+	kunmap(e->page);
+	spin_unlock(&ni->writequeue_lock);
+
+	if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+		spin_lock_bh(&write_nodes_lock);
+		list_add_tail(&ni->write_list, &write_nodes);
+		spin_unlock_bh(&write_nodes_lock);
+		wake_up_process(send_task);
+	}
+}
+
+/* Release a write-queue entry and the page that backs it. */
+static void free_entry(struct writequeue_entry *e)
+{
+	struct page *pg = e->page;
+
+	__free_page(pg);
+	kfree(e);
+}
+
+/* Initiate an SCTP association. In theory we could just use sendmsg() on
+   the first IP address and it should work, but this allows us to set up the
+   association before sending any valuable data that we can't afford to lose.
+   It also keeps the send path clean as it can now always use the association ID.
+
+   A single throw-away byte is sent to the node's address with our nodeid in
+   sinfo_ppid.  If the send fails, NI_INIT_PENDING is cleared so that a later
+   send attempt tries again. */
+static void initiate_association(int nodeid)
+{
+	struct sockaddr_storage rem_addr;
+	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct msghdr outmessage;
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	int ret;
+	int addrlen;
+	/* Zeroed so that no uninitialised stack byte goes out on the wire */
+	char buf[1] = { 0 };
+	struct kvec iov[1];
+	struct nodeinfo *ni;
+
+	log_print("Initiating association with node %d", nodeid);
+
+	ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+	if (!ni)
+		return;
+
+	/* NOTE(review): NI_INIT_PENDING stays set on this path, unlike the
+	   send failure below - confirm that is intended */
+	if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
+		log_print("no address for nodeid %d", nodeid);
+		return;
+	}
+
+	make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
+
+	outmessage.msg_name = &rem_addr;
+	outmessage.msg_namelen = addrlen;
+	outmessage.msg_control = outcmsg;
+	outmessage.msg_controllen = sizeof(outcmsg);
+	outmessage.msg_flags = MSG_EOR;
+
+	iov[0].iov_base = buf;
+	iov[0].iov_len = 1;
+
+	/* Real INIT messages seem to cause trouble. Just send a 1 byte message
+	   we can afford to lose */
+	cmsg = CMSG_FIRSTHDR(&outmessage);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
+
+	outmessage.msg_controllen = cmsg->cmsg_len;
+	ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
+	if (ret < 0) {
+		log_print("send INIT to node failed: %d", ret);
+		/* Try again later */
+		clear_bit(NI_INIT_PENDING, &ni->flags);
+	}
+}
+
+/*
+ * Send a message.
+ *
+ * Drains the node's write queue onto the shared SCTP socket.  If the node
+ * has no association yet, one is initiated instead and nothing is sent.
+ * On -EAGAIN the eagain flag is raised and the queue is left as it is; on
+ * any other send error the association is reset and re-initiated.
+ *
+ * Returns 0, the result of the last kernel_sendmsg(), or its negative error.
+ */
+static int send_to_sock(struct nodeinfo *ni)
+{
+	int ret = 0;
+	struct writequeue_entry *e;
+	int len, offset;
+	struct msghdr outmsg;
+	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	struct kvec iov;
+
+	/* See if we need to init an association before we start
+	   sending precious messages */
+	spin_lock(&ni->lock);
+	if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+		spin_unlock(&ni->lock);
+		initiate_association(ni->nodeid);
+		return 0;
+	}
+	spin_unlock(&ni->lock);
+
+	outmsg.msg_name = NULL; /* We use assoc_id */
+	outmsg.msg_namelen = 0;
+	outmsg.msg_control = outcmsg;
+	outmsg.msg_controllen = sizeof(outcmsg);
+	outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
+
+	cmsg = CMSG_FIRSTHDR(&outmsg);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
+	sinfo->sinfo_assoc_id = ni->assoc_id;
+	outmsg.msg_controllen = cmsg->cmsg_len;
+
+	spin_lock(&ni->writequeue_lock);
+	for (;;) {
+		if (list_empty(&ni->writequeue))
+			break;
+		e = list_entry(ni->writequeue.next, struct writequeue_entry,
+			       list);
+		len = e->len;
+		offset = e->offset;
+		BUG_ON(len == 0 && e->users == 0);
+		/* The send is done without the queue lock held */
+		spin_unlock(&ni->writequeue_lock);
+
+		ret = 0;
+		if (len) {
+			/*
+			 * Map the page only for the duration of the send and
+			 * drop the mapping on every exit path, so each kmap()
+			 * is paired with a kunmap().
+			 */
+			kmap(e->page);
+			iov.iov_base = page_address(e->page)+offset;
+			iov.iov_len = len;
+
+			ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
+					     len);
+			kunmap(e->page);
+			if (ret == -EAGAIN) {
+				sctp_con.eagain_flag = 1;
+				goto out;
+			} else if (ret < 0)
+				goto send_error;
+		} else {
+			/* Don't starve people filling buffers */
+			schedule();
+		}
+
+		spin_lock(&ni->writequeue_lock);
+		e->offset += ret;
+		e->len -= ret;
+
+		/* Fully sent and nobody still writing into it: retire it */
+		if (e->len == 0 && e->users == 0) {
+			list_del(&e->list);
+			free_entry(e);
+			continue;
+		}
+	}
+	spin_unlock(&ni->writequeue_lock);
+ out:
+	return ret;
+
+ send_error:
+	log_print("Error sending to node %d %d", ni->nodeid, ret);
+	spin_lock(&ni->lock);
+	if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+		ni->assoc_id = 0;
+		spin_unlock(&ni->lock);
+		initiate_association(ni->nodeid);
+	} else
+		spin_unlock(&ni->lock);
+
+	return ret;
+}
+
+/*
+ * Try to send any messages that are pending.
+ * Each node on write_nodes is taken off the list, has its WRITE_PENDING
+ * flag cleared and is handed to send_to_sock() with the list lock dropped.
+ */
+static void process_output_queue(void)
+{
+	struct list_head *cur, *next;
+	struct nodeinfo *ni;
+
+	spin_lock_bh(&write_nodes_lock);
+	list_for_each_safe(cur, next, &write_nodes) {
+		ni = list_entry(cur, struct nodeinfo, write_list);
+		clear_bit(NI_WRITE_PENDING, &ni->flags);
+		list_del(&ni->write_list);
+
+		/* Do not hold the list lock across the send */
+		spin_unlock_bh(&write_nodes_lock);
+		send_to_sock(ni);
+		spin_lock_bh(&write_nodes_lock);
+	}
+	spin_unlock_bh(&write_nodes_lock);
+}
+
+/*
+ * Called after we've had -EAGAIN and been woken up.
+ * Puts every known node that is not already pending back on write_nodes.
+ */
+static void refill_write_queue(void)
+{
+	struct nodeinfo *ni;
+	int nodeid;
+
+	for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
+		ni = nodeid2nodeinfo(nodeid, 0);
+		if (!ni)
+			continue;
+		if (test_and_set_bit(NI_WRITE_PENDING, &ni->flags))
+			continue;
+
+		spin_lock_bh(&write_nodes_lock);
+		list_add_tail(&ni->write_list, &write_nodes);
+		spin_unlock_bh(&write_nodes_lock);
+	}
+}
+
+/* Drop and free every entry queued for one node. */
+static void clean_one_writequeue(struct nodeinfo *ni)
+{
+	struct writequeue_entry *e;
+
+	spin_lock(&ni->writequeue_lock);
+	while (!list_empty(&ni->writequeue)) {
+		e = list_entry(ni->writequeue.next, struct writequeue_entry,
+			       list);
+		list_del(&e->list);
+		free_entry(e);
+	}
+	spin_unlock(&ni->writequeue_lock);
+}
+
+/* Empty the write queue of every node up to max_nodeid. */
+static void clean_writequeues(void)
+{
+	struct nodeinfo *ni;
+	int nodeid;
+
+	for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
+		ni = nodeid2nodeinfo(nodeid, 0);
+		if (!ni)
+			continue;
+		clean_one_writequeue(ni);
+	}
+}
+
+
+/* Remove every nodeinfo up to max_nodeid from the idr and free it. */
+static void dealloc_nodeinfo(void)
+{
+	struct nodeinfo *ni;
+	int nodeid;
+
+	for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
+		ni = nodeid2nodeinfo(nodeid, 0);
+		if (!ni)
+			continue;
+		idr_remove(&nodeinfo_idr, nodeid);
+		kfree(ni);
+	}
+}
+
+/*
+ * Forget the association with nodeid and throw away anything still queued
+ * for it.  Returns 0, or -1 if there is no nodeinfo for the node.
+ */
+int dlm_lowcomms_close(int nodeid)
+{
+	struct nodeinfo *ni = nodeid2nodeinfo(nodeid, 0);
+
+	if (ni == NULL)
+		return -1;
+
+	spin_lock(&ni->lock);
+	/* Don't send shutdown here, sctp will just queue it
+	   till the node comes back up! */
+	if (ni->assoc_id != 0)
+		ni->assoc_id = 0;
+	spin_unlock(&ni->lock);
+
+	clean_one_writequeue(ni);
+	clear_bit(NI_INIT_PENDING, &ni->flags);
+	return 0;
+}
+
+/* Return non-zero if no node is waiting on write_nodes. */
+static int write_list_empty(void)
+{
+	int empty;
+
+	spin_lock_bh(&write_nodes_lock);
+	empty = list_empty(&write_nodes);
+	spin_unlock_bh(&write_nodes_lock);
+
+	return empty;
+}
+
+/*
+ * Receive thread body.  Sleeps on lowcomms_recv_wait until CF_READ_PENDING
+ * is set on sctp_con, then calls receive_from_sock() repeatedly until it
+ * returns a negative value or the thread is asked to stop.  Always
+ * returns 0.
+ */
+static int dlm_recvd(void *data)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ while (!kthread_should_stop()) {
+ int count = 0;
+
+ /* Mark ourselves sleeping before testing the flag, so a wakeup
+ * that arrives between the test and schedule() is not lost */
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&lowcomms_recv_wait, &wait);
+ if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
+ schedule();
+ remove_wait_queue(&lowcomms_recv_wait, &wait);
+ set_current_state(TASK_RUNNING);
+
+ if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
+ int ret;
+
+ do {
+ ret = receive_from_sock();
+
+ /* Don't starve out everyone else */
+ if (++count >= MAX_RX_MSG_COUNT) {
+ schedule();
+ count = 0;
+ }
+ } while (!kthread_should_stop() && ret >=0);
+ }
+ /* Task state is TASK_RUNNING here, so this only yields the CPU */
+ schedule();
+ }
+
+ return 0;
+}
+
+/*
+ * Send thread body.  Waits on the socket's sleep queue while write_nodes is
+ * empty; when woken it re-queues all nodes if a previous send hit -EAGAIN
+ * (eagain_flag) and then runs process_output_queue().  Always returns 0.
+ */
+static int dlm_sendd(void *data)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ /* Stay on the socket's wait queue for the lifetime of the thread */
+ add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (write_list_empty())
+ schedule();
+ set_current_state(TASK_RUNNING);
+
+ if (sctp_con.eagain_flag) {
+ sctp_con.eagain_flag = 0;
+ refill_write_queue();
+ }
+ process_output_queue();
+ }
+
+ remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+
+ return 0;
+}
+
+/* Stop the receive thread, then the send thread. */
+static void daemons_stop(void)
+{
+	struct task_struct *rx = recv_task;
+
+	kthread_stop(rx);
+	kthread_stop(send_task);
+}
+
+/*
+ * Start the receive and send threads.
+ * Returns 0 on success or the negative errno from kthread_run().  If the
+ * send thread cannot be started, the receive thread is stopped again.
+ */
+static int daemons_start(void)
+{
+	struct task_struct *p;
+	int error;
+
+	p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
+	if (IS_ERR(p)) {
+		/* PTR_ERR gives the real errno; IS_ERR only yields 0 or 1 */
+		error = PTR_ERR(p);
+		log_print("can't start dlm_recvd %d", error);
+		return error;
+	}
+	recv_task = p;
+
+	p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
+	if (IS_ERR(p)) {
+		error = PTR_ERR(p);
+		log_print("can't start dlm_sendd %d", error);
+		kthread_stop(recv_task);
+		return error;
+	}
+	send_task = p;
+
+	return 0;
+}
+
+/*
+ * Bring up the socket and the two worker threads, then start accepting
+ * buffers.  This is quite likely to sleep...
+ * Returns 0 on success; on failure the connection is closed and the error
+ * from init_sock() or daemons_start() is returned.
+ */
+int dlm_lowcomms_start(void)
+{
+	int rv = init_sock();
+
+	if (!rv)
+		rv = daemons_start();
+	if (rv) {
+		close_connection();
+		return rv;
+	}
+
+	atomic_set(&accepting, 1);
+	return 0;
+}
+
+/* Set all the activity flags to prevent any socket activity, then stop the
+ * threads, discard queued writes, close the connection and free all
+ * nodeinfo structures. */
+
+void dlm_lowcomms_stop(void)
+{
+ /* Refuse new buffers first (checked in get_buffer/commit_buffer) */
+ atomic_set(&accepting, 0);
+ /* NOTE(review): 0x7 presumably sets the three low CF_* flag bits -
+ * confirm against the flag definitions */
+ sctp_con.flags = 0x7;
+ daemons_stop();
+ clean_writequeues();
+ close_connection();
+ dealloc_nodeinfo();
+ max_nodeid = 0;
+}
+
+/*
+ * One-time setup of the lowcomms wait queue, write list and locks.
+ * Always returns 0.
+ */
+int dlm_lowcomms_init(void)
+{
+	/* list of nodes with pending writes and the lock guarding it */
+	INIT_LIST_HEAD(&write_nodes);
+	spin_lock_init(&write_nodes_lock);
+
+	init_rwsem(&nodeinfo_lock);
+	init_waitqueue_head(&lowcomms_recv_wait);
+
+	return 0;
+}
+
+/* Free the saved copies of the local addresses and reset the local state. */
+void dlm_lowcomms_exit(void)
+{
+	int idx = 0;
+
+	while (idx < dlm_local_count) {
+		kfree(dlm_local_addr[idx]);
+		idx++;
+	}
+
+	dlm_local_nodeid = 0;
+	dlm_local_count = 0;
+}
+
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..6c04bb09cfa8
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOWCOMMS_DOT_H__
+#define __LOWCOMMS_DOT_H__
+
+/* Module-lifetime setup and teardown of the lowcomms state */
+int dlm_lowcomms_init(void);
+void dlm_lowcomms_exit(void);
+/* Bring the socket and worker threads up / take them down again */
+int dlm_lowcomms_start(void);
+void dlm_lowcomms_stop(void);
+/* Drop the connection state and queued writes for one node */
+int dlm_lowcomms_close(int nodeid);
+/* Reserve len bytes of send buffer for nodeid; *ppc points at the space and
+ * the return value is the handle to pass to dlm_lowcomms_commit_buffer() */
+void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
+void dlm_lowcomms_commit_buffer(void *mh);
+
+#endif /* __LOWCOMMS_DOT_H__ */
+
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LVB_TABLE_DOT_H__
+#define __LVB_TABLE_DOT_H__
+
+/* 8x8 table of LVB operations, defined elsewhere.
+ * NOTE(review): presumably indexed by a pair of lock modes - confirm at
+ * the definition */
+extern const int dlm_lvb_operations[8][8];
+
+#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..a8da8dc36b2e
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,97 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "user.h"
+#include "memory.h"
+#include "lowcomms.h"
+#include "config.h"
+
+/* Without CONFIG_DLM_DEBUG the debugfs hooks become no-op stubs, so
+ * init_dlm() and exit_dlm() can call them unconditionally. */
+#ifdef CONFIG_DLM_DEBUG
+int dlm_register_debugfs(void);
+void dlm_unregister_debugfs(void);
+#else
+static inline int dlm_register_debugfs(void) { return 0; }
+static inline void dlm_unregister_debugfs(void) { }
+#endif
+
+/*
+ * Module entry point.  Initialises the DLM subsystems in order (memory,
+ * lockspace, config, debugfs, lowcomms, user); if one of them fails, the
+ * ones already set up are torn down in reverse order and the error is
+ * returned.
+ */
+static int __init init_dlm(void)
+{
+	int rv;
+
+	rv = dlm_memory_init();
+	if (rv)
+		goto fail;
+
+	rv = dlm_lockspace_init();
+	if (rv)
+		goto undo_memory;
+
+	rv = dlm_config_init();
+	if (rv)
+		goto undo_lockspace;
+
+	rv = dlm_register_debugfs();
+	if (rv)
+		goto undo_config;
+
+	rv = dlm_lowcomms_init();
+	if (rv)
+		goto undo_debugfs;
+
+	rv = dlm_user_init();
+	if (rv)
+		goto undo_lowcomms;
+
+	printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
+
+	return 0;
+
+ undo_lowcomms:
+	dlm_lowcomms_exit();
+ undo_debugfs:
+	dlm_unregister_debugfs();
+ undo_config:
+	dlm_config_exit();
+ undo_lockspace:
+	dlm_lockspace_exit();
+ undo_memory:
+	dlm_memory_exit();
+ fail:
+	return rv;
+}
+
+/*
+ * Module exit point: tears down the subsystems set up by init_dlm().
+ * NOTE(review): this is not the exact reverse of the init order (memory is
+ * released before lockspace, debugfs last) - confirm that is intentional.
+ */
+static void __exit exit_dlm(void)
+{
+ dlm_user_exit();
+ dlm_lowcomms_exit();
+ dlm_config_exit();
+ dlm_memory_exit();
+ dlm_lockspace_exit();
+ dlm_unregister_debugfs();
+}
+
+/* Module entry and exit hooks */
+module_init(init_dlm);
+module_exit(exit_dlm);
+
+MODULE_DESCRIPTION("Distributed Lock Manager");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+/* Locking interface exported to other GPL modules */
+EXPORT_SYMBOL_GPL(dlm_new_lockspace);
+EXPORT_SYMBOL_GPL(dlm_release_lockspace);
+EXPORT_SYMBOL_GPL(dlm_lock);
+EXPORT_SYMBOL_GPL(dlm_unlock);
+
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..a3f7de7f3a8f
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,327 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "recover.h"
+#include "rcom.h"
+#include "config.h"
+
+/*
+ * The following functions are called by the dlm_recoverd thread.
+ */
+
+/*
+ * Insert new into ls->ls_nodes, keeping the list sorted by ascending nodeid.
+ * The new member goes in front of the first existing member with a larger
+ * nodeid, or at the tail if there is none.
+ */
+static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
+{
+	struct dlm_member *memb;
+	struct list_head *pos;
+	struct list_head *head = &ls->ls_nodes;
+
+	list_for_each(pos, head) {
+		memb = list_entry(pos, struct dlm_member, list);
+		if (new->nodeid < memb->nodeid)
+			break;
+	}
+
+	/*
+	 * pos is either the first member with a larger nodeid or, when the
+	 * loop ran off the end (or the list is empty), the list head itself.
+	 * list_add_tail() links the new entry immediately before pos, which
+	 * covers both cases.
+	 */
+	list_add_tail(&new->list, pos);
+}
+
+/*
+ * Allocate a member structure for nodeid, look up its weight and add it to
+ * the lockspace's ordered member list.
+ * Returns 0 on success, -ENOMEM if allocation fails, or the negative value
+ * returned by dlm_node_weight().
+ */
+static int dlm_add_member(struct dlm_ls *ls, int nodeid)
+{
+	struct dlm_member *memb;
+	int w;
+
+	memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
+	if (!memb)
+		return -ENOMEM;
+
+	w = dlm_node_weight(ls->ls_name, nodeid);
+	if (w < 0) {
+		/* not on any list yet, so free it here to avoid a leak */
+		kfree(memb);
+		return w;
+	}
+
+	memb->nodeid = nodeid;
+	memb->weight = w;
+	add_ordered_member(ls, memb);
+	ls->ls_num_nodes++;
+	return 0;
+}
+
+/* Move memb from the active member list to ls_nodes_gone. */
+static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
+{
+	ls->ls_num_nodes--;
+	list_move(&memb->list, &ls->ls_nodes_gone);
+}
+
+/* Return 1 if nodeid is on the lockspace's current member list, else 0. */
+static int dlm_is_member(struct dlm_ls *ls, int nodeid)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &ls->ls_nodes) {
+		struct dlm_member *m = list_entry(pos, struct dlm_member, list);
+
+		if (m->nodeid == nodeid)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Return 1 if nodeid is on the list of departed members, else 0. */
+int dlm_is_removed(struct dlm_ls *ls, int nodeid)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &ls->ls_nodes_gone) {
+		struct dlm_member *m = list_entry(pos, struct dlm_member, list);
+
+		if (m->nodeid == nodeid)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Unlink and free every dlm_member on the given list. */
+static void clear_memb_list(struct list_head *head)
+{
+	struct dlm_member *memb, *next;
+
+	list_for_each_entry_safe(memb, next, head, list) {
+		list_del(&memb->list);
+		kfree(memb);
+	}
+}
+
+/* Free all current members of the lockspace and reset the member count. */
+void dlm_clear_members(struct dlm_ls *ls)
+{
+	struct list_head *members = &ls->ls_nodes;
+
+	clear_memb_list(members);
+	ls->ls_num_nodes = 0;
+}
+
+/* Free the list of members that have left the lockspace. */
+void dlm_clear_members_gone(struct dlm_ls *ls)
+{
+	struct list_head *gone = &ls->ls_nodes_gone;
+
+	clear_memb_list(gone);
+}
+
+/*
+ * Rebuild ls->ls_node_array: an array of nodeids in which every member
+ * appears once per unit of weight.  If no member has a weight, each member
+ * appears exactly once.  ls_total_weight is set to the array length.  On
+ * allocation failure ls_node_array is left NULL.
+ */
+static void make_member_array(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	int *array;
+	int total = 0, all_zero = 0;
+	int x = 0;
+	int i, w;
+
+	kfree(ls->ls_node_array);
+	ls->ls_node_array = NULL;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list)
+		total += memb->weight;
+
+	/* all nodes revert to weight of 1 if all have weight 0 */
+	if (total == 0) {
+		all_zero = 1;
+		total = ls->ls_num_nodes;
+	}
+
+	ls->ls_total_weight = total;
+
+	array = kmalloc(sizeof(int) * total, GFP_KERNEL);
+	if (array == NULL)
+		return;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		w = all_zero ? 1 : memb->weight;
+
+		/* zero-weight members get no slots */
+		if (!w)
+			continue;
+
+		DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
+
+		for (i = 0; i < w; i++)
+			array[x++] = memb->nodeid;
+	}
+
+	ls->ls_node_array = array;
+}
+
+/*
+ * Send a status request to all members just to establish comms connections.
+ * Stops at the first member for which recovery has been stopped or the
+ * status request fails, and returns that error; returns 0 otherwise.
+ */
+
+static int ping_members(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	int rv = 0;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		rv = dlm_recovery_stopped(ls);
+		if (!rv)
+			rv = dlm_rcom_status(ls, memb->nodeid);
+		if (rv)
+			break;
+	}
+
+	if (rv)
+		log_debug(ls, "ping_members aborted %d last nodeid %d",
+			  rv, ls->ls_recover_nodeid);
+	return rv;
+}
+
+/*
+ * Bring the lockspace's member list in line with the node list in rv.
+ *
+ * Members that are not in rv->nodeids are moved to ls_nodes_gone, nodes
+ * that are new are added, the lowest nodeid and the weighted node array are
+ * recomputed, and every member is then pinged.  *neg_out is set to the
+ * number of members that were removed.
+ *
+ * Returns 0 on success, or the error from ping_members() or
+ * dlm_recover_members_wait().
+ */
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
+{
+	struct dlm_member *memb, *safe;
+	int i, error, found, neg = 0, low = -1;
+
+	/* move departed members from ls_nodes to ls_nodes_gone */
+
+	list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
+		found = 0;
+		for (i = 0; i < rv->node_count; i++) {
+			if (memb->nodeid == rv->nodeids[i]) {
+				found = 1;
+				break;
+			}
+		}
+
+		if (!found) {
+			neg++;
+			dlm_remove_member(ls, memb);
+			log_debug(ls, "remove member %d", memb->nodeid);
+		}
+	}
+
+	/* add new members to ls_nodes */
+
+	for (i = 0; i < rv->node_count; i++) {
+		if (dlm_is_member(ls, rv->nodeids[i]))
+			continue;
+		error = dlm_add_member(ls, rv->nodeids[i]);
+		if (error) {
+			/* report the failure rather than claiming the node
+			   was added; the rest of the list is still handled */
+			log_error(ls, "add member %d failed %d",
+				  rv->nodeids[i], error);
+			continue;
+		}
+		log_debug(ls, "add member %d", rv->nodeids[i]);
+	}
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (low == -1 || memb->nodeid < low)
+			low = memb->nodeid;
+	}
+	ls->ls_low_nodeid = low;
+
+	make_member_array(ls);
+	dlm_set_recover_status(ls, DLM_RS_NODES);
+	*neg_out = neg;
+
+	error = ping_members(ls);
+	if (error)
+		goto out;
+
+	error = dlm_recover_members_wait(ls);
+ out:
+	log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
+	return error;
+}
+
+/*
+ * The following functions are called from lockspace.c.
+ */
+
+/*
+ * Stop the lockspace: flag any running recovery as stopped, clear RUNNING,
+ * bump the recovery sequence number and reset the recovery status.
+ * Always returns 0.
+ */
+int dlm_ls_stop(struct dlm_ls *ls)
+{
+ int new;
+
+ /*
+ * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
+ * dlm_recovery_stopped()) and prevents any new locks from being
+ * processed (see RUNNING, dlm_locking_stopped()).
+ */
+
+ spin_lock(&ls->ls_recover_lock);
+ set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+ /* new is non-zero only if the lockspace was running until now */
+ new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
+ ls->ls_recover_seq++;
+ spin_unlock(&ls->ls_recover_lock);
+
+ /*
+ * This in_recovery lock does two things:
+ *
+ * 1) Keeps this function from returning until all threads are out
+ * of locking routines and locking is truly stopped.
+ * 2) Keeps any new requests from being processed until it's unlocked
+ * when recovery is complete.
+ */
+
+ if (new)
+ down_write(&ls->ls_in_recovery);
+
+ /*
+ * The recoverd suspend/resume makes sure that dlm_recoverd (if
+ * running) has noticed the clearing of RUNNING above and quit
+ * processing the previous recovery. This will be true for all nodes
+ * before any nodes start the new recovery.
+ */
+
+ dlm_recoverd_suspend(ls);
+ ls->ls_recover_status = 0;
+ dlm_recoverd_resume(ls);
+ return 0;
+}
+
+/*
+ * Queue a new recovery for a stopped lockspace.
+ *
+ * Fetches the current node list, installs it as the lockspace's recovery
+ * arguments (freeing any arguments that were still pending) and kicks
+ * dlm_recoverd.  Returns 0 on success, -ENOMEM if the arguments cannot be
+ * allocated, -EINVAL if the lockspace has not been stopped, or the
+ * non-positive result of dlm_nodeid_list().
+ */
+int dlm_ls_start(struct dlm_ls *ls)
+{
+	struct dlm_recover *rv, *rv_old;
+	int *ids = NULL;
+	int error, count;
+
+	rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
+	if (rv == NULL)
+		return -ENOMEM;
+
+	count = dlm_nodeid_list(ls->ls_name, &ids);
+	error = count;
+	if (count <= 0)
+		goto fail;
+
+	spin_lock(&ls->ls_recover_lock);
+
+	/* the lockspace needs to be stopped before it can be started */
+
+	if (!dlm_locking_stopped(ls)) {
+		spin_unlock(&ls->ls_recover_lock);
+		log_error(ls, "start ignored: lockspace running");
+		error = -EINVAL;
+		goto fail;
+	}
+
+	/* swap in the new arguments; the old ones are freed after unlock */
+	rv_old = ls->ls_recover_args;
+	rv->nodeids = ids;
+	rv->node_count = count;
+	rv->seq = ++ls->ls_recover_seq;
+	ls->ls_recover_args = rv;
+	spin_unlock(&ls->ls_recover_lock);
+
+	if (rv_old != NULL) {
+		kfree(rv_old->nodeids);
+		kfree(rv_old);
+	}
+
+	dlm_recoverd_kick(ls);
+	return 0;
+
+ fail:
+	kfree(rv);
+	kfree(ids);
+	return error;
+}
+
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMBER_DOT_H__
+#define __MEMBER_DOT_H__
+
+/* Stop locking in a lockspace / queue a new recovery for a stopped one */
+int dlm_ls_stop(struct dlm_ls *ls);
+int dlm_ls_start(struct dlm_ls *ls);
+/* Free the current-member list / the departed-member list */
+void dlm_clear_members(struct dlm_ls *ls);
+void dlm_clear_members_gone(struct dlm_ls *ls);
+/* Sync the member list with rv; *neg_out gets the number of removed members */
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
+/* Returns 1 if nodeid is on the departed-member list, else 0 */
+int dlm_is_removed(struct dlm_ls *ls, int nodeid);
+
+#endif /* __MEMBER_DOT_H__ */
+
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..989b608fd836
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,116 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "config.h"
+#include "memory.h"
+
+/* Slab cache for struct dlm_lkb objects; created by dlm_memory_init() and
+   destroyed by dlm_memory_exit(). */
+static kmem_cache_t *lkb_cache;
+
+
+/* Create the "dlm_lkb" slab cache used by allocate_lkb()/free_lkb().
+   Returns 0 on success, -ENOMEM if the cache could not be created. */
+int dlm_memory_init(void)
+{
+	lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
+				      __alignof__(struct dlm_lkb), 0,
+				      NULL, NULL);
+
+	return lkb_cache ? 0 : -ENOMEM;
+}
+
+/* Destroy the lkb slab cache if dlm_memory_init() created one. */
+void dlm_memory_exit(void)
+{
+	if (!lkb_cache)
+		return;
+
+	kmem_cache_destroy(lkb_cache);
+}
+
+/* Allocate a zero-filled lock value block of ls->ls_lvblen bytes.
+   Returns NULL if the allocation fails.  Release with free_lvb(). */
+char *allocate_lvb(struct dlm_ls *ls)
+{
+	/* kzalloc() is kmalloc() plus zeroing in one call */
+	return kzalloc(ls->ls_lvblen, GFP_KERNEL);
+}
+
+/* Release a buffer obtained from allocate_lvb(); p is handed straight to
+   kfree(). */
+void free_lvb(char *p)
+{
+ kfree(p);
+}
+
+/* FIXME: have some minimal space built-in to rsb for the name and
+ kmalloc a separate name if needed, like dentries are done */
+
+/* Allocate a zeroed rsb followed by namelen extra bytes (space for the
+   resource name, per the FIXME above).  namelen is asserted to be no larger
+   than DLM_RESNAME_MAXLEN.  Returns NULL if the allocation fails. */
+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
+{
+	DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
+
+	/* kzalloc() replaces the separate kmalloc() + memset() pair */
+	return kzalloc(sizeof(struct dlm_rsb) + namelen, GFP_KERNEL);
+}
+
+/* Free an rsb together with its lvb buffer, if it has one. */
+void free_rsb(struct dlm_rsb *r)
+{
+	/* free_lvb() is a plain kfree(), which accepts NULL, so no guard
+	   is needed around it */
+	free_lvb(r->res_lvbptr);
+	kfree(r);
+}
+
+/* Take a zeroed lkb from the lkb slab cache.  Returns NULL if the cache
+   allocation fails.  The ls argument is not used. */
+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
+{
+	struct dlm_lkb *new_lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
+
+	if (!new_lkb)
+		return NULL;
+
+	memset(new_lkb, 0, sizeof(*new_lkb));
+	return new_lkb;
+}
+
+/* Return an lkb to the slab cache.  For lkbs flagged DLM_IFL_USER the
+   lkb_astparam field holds a struct dlm_user_args pointer; that structure
+   and the lvb buffer hanging off its lksb are freed first. */
+void free_lkb(struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		struct dlm_user_args *ua;
+
+		ua = (struct dlm_user_args *)lkb->lkb_astparam;
+		if (ua) {
+			/* kfree(NULL) is a no-op, so sb_lvbptr needs no
+			   separate check */
+			kfree(ua->lksb.sb_lvbptr);
+			kfree(ua);
+		}
+	}
+	kmem_cache_free(lkb_cache, lkb);
+}
+
+/* Allocate a zeroed directory entry followed by namelen extra bytes.
+   namelen is asserted to be no larger than DLM_RESNAME_MAXLEN.
+   Returns NULL if the allocation fails. */
+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
+{
+	DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
+		   printk("namelen = %d\n", namelen););
+
+	/* kzalloc() replaces the separate kmalloc() + memset() pair */
+	return kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL);
+}
+
+/* Release a directory entry obtained from allocate_direntry(). */
+void free_direntry(struct dlm_direntry *de)
+{
+ kfree(de);
+}
+
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMORY_DOT_H__
+#define __MEMORY_DOT_H__
+
+int dlm_memory_init(void);
+void dlm_memory_exit(void);
+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
+void free_rsb(struct dlm_rsb *r);
+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
+void free_lkb(struct dlm_lkb *l);
+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
+void free_direntry(struct dlm_direntry *de);
+char *allocate_lvb(struct dlm_ls *ls);
+void free_lvb(char *l);
+
+#endif /* __MEMORY_DOT_H__ */
+
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * midcomms.c
+ *
+ * This is the appallingly named "mid-level" comms layer.
+ *
+ * Its purpose is to take packets from the "real" comms layer,
+ * split them up into packets and pass them to the interested
+ * part of the locking mechanism.
+ *
+ * It also takes messages from the locking layer, formats them
+ * into packets and sends them to the comms layer.
+ */
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "rcom.h"
+#include "lock.h"
+#include "midcomms.h"
+
+
+/* Copy len bytes out of a circular buffer of size limit that starts at
+   base, beginning at offset.  If the requested span runs past the end of
+   the buffer, the remainder is taken from the start of the buffer. */
+static void copy_from_cb(void *dst, const void *base, unsigned offset,
+			 unsigned len, unsigned limit)
+{
+	unsigned head = len;	/* bytes taken before the wrap point */
+	unsigned tail;		/* bytes taken from the buffer start */
+
+	if (offset + head > limit)
+		head = limit - offset;
+	tail = len - head;
+
+	memcpy(dst, base + offset, head);
+	if (tail)
+		memcpy(dst + head, base, tail);
+}
+
+/*
+ * Called from the low-level comms layer to process a buffer of
+ * commands.
+ *
+ * Only complete messages are processed here, any "spare" bytes from
+ * the end of a buffer are saved and tacked onto the front of the next
+ * message that comes in. I doubt this will happen very often but we
+ * need to be able to cope with it and I don't want the task to be waiting
+ * for packets to come in when there is useful work to be done.
+ */
+
+/* Process every complete message found in the circular receive buffer.
+
+   nodeid: node the data came from, passed on to the message handlers
+   base, limit: start and size of the circular buffer
+   offset, len: position and amount of unprocessed data in it
+
+   Returns the number of bytes consumed, or -EINVAL / -E2BIG when a header
+   advertises an impossible length.  If the large temporary buffer cannot
+   be allocated, the count consumed so far is returned. */
+int dlm_process_incoming_buffer(int nodeid, const void *base,
+				unsigned offset, unsigned len, unsigned limit)
+{
+	/* The union forces the on-stack scratch area to be aligned well
+	   enough to be accessed as a struct dlm_header (and for 64-bit
+	   fields in the messages that follow the header); a bare unsigned
+	   char array carries no such guarantee. */
+	union {
+		unsigned char buf[DLM_INBUF_LEN];
+		struct dlm_header hd;
+		uint64_t align;
+	} tmp;
+	struct dlm_header *msg = &tmp.hd;
+	int ret = 0;
+	int err = 0;
+	uint16_t msglen;
+	uint32_t lockspace;
+
+	while (len > sizeof(struct dlm_header)) {
+
+		/* Copy just the header to check the total length.  The
+		   message may wrap around the end of the buffer back to the
+		   start, so we need to use a temp buffer and copy_from_cb. */
+
+		copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
+			     limit);
+
+		msglen = le16_to_cpu(msg->h_length);
+		lockspace = msg->h_lockspace;
+
+		err = -EINVAL;
+		if (msglen < sizeof(struct dlm_header))
+			break;
+		err = -E2BIG;
+		if (msglen > dlm_config.buffer_size) {
+			log_print("message size %d from %d too big, buf len %d",
+				  msglen, nodeid, len);
+			break;
+		}
+		err = 0;
+
+		/* If only part of the full message is contained in this
+		   buffer, then stop and wait for lowcomms to call us again
+		   later with more data.  The return value is the number of
+		   bytes consumed so far (0 if this was the first message). */
+
+		if (msglen > len)
+			break;
+
+		/* Allocate a larger temp buffer if the full message won't fit
+		   in the buffer on the stack (which should work for most
+		   ordinary messages).  This happens at most once per call;
+		   the allocation is reused for later messages. */
+
+		if (msglen > sizeof(tmp.buf) && msg == &tmp.hd) {
+			msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+			if (msg == NULL)
+				return ret;
+		}
+
+		copy_from_cb(msg, base, offset, msglen, limit);
+
+		BUG_ON(lockspace != msg->h_lockspace);
+
+		ret += msglen;
+		offset += msglen;
+		/* NOTE(review): masking wraps correctly only if limit is a
+		   power of two - confirm against the lowcomms caller */
+		offset &= (limit - 1);
+		len -= msglen;
+
+		switch (msg->h_cmd) {
+		case DLM_MSG:
+			dlm_receive_message(msg, nodeid, 0);
+			break;
+
+		case DLM_RCOM:
+			dlm_receive_rcom(msg, nodeid);
+			break;
+
+		default:
+			log_print("unknown msg type %x from %u: %u %u %u %u",
+				  msg->h_cmd, nodeid, msglen, len, offset, ret);
+		}
+	}
+
+	if (msg != &tmp.hd)
+		kfree(msg);
+
+	return err ? err : ret;
+}
+
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MIDCOMMS_DOT_H__
+#define __MIDCOMMS_DOT_H__
+
+int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
+ unsigned len, unsigned limit);
+
+#endif /* __MIDCOMMS_DOT_H__ */
+
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..518239a8b1e9
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,472 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "midcomms.h"
+#include "rcom.h"
+#include "recover.h"
+#include "dir.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "util.h"
+
+
+/* Test function handed to dlm_wait_function(): non-zero once
+   LSFL_RCOM_READY has been set in ls_flags (receive_sync_reply() sets it). */
+static int rcom_response(struct dlm_ls *ls)
+{
+ return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
+}
+
+/* Obtain a lowcomms buffer for an rcom message to to_nodeid with len bytes
+   of payload after the struct dlm_rcom, zero it and fill in the header and
+   rc_type.  On success the message and its buffer handle are returned
+   through rc_ret / mh_ret and 0 is returned; -ENOBUFS if no buffer could
+   be obtained. */
+static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+		       struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
+{
+	int total_len = sizeof(struct dlm_rcom) + len;
+	struct dlm_mhandle *handle;
+	struct dlm_rcom *msg;
+	char *buf;
+
+	handle = dlm_lowcomms_get_buffer(to_nodeid, total_len, GFP_KERNEL,
+					 &buf);
+	if (!handle) {
+		log_print("create_rcom to %d type %d len %d ENOBUFS",
+			  to_nodeid, type, len);
+		return -ENOBUFS;
+	}
+
+	memset(buf, 0, total_len);
+	msg = (struct dlm_rcom *) buf;
+
+	msg->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	msg->rc_header.h_lockspace = ls->ls_global_id;
+	msg->rc_header.h_nodeid = dlm_our_nodeid();
+	msg->rc_header.h_length = total_len;
+	msg->rc_header.h_cmd = DLM_RCOM;
+	msg->rc_type = type;
+
+	*mh_ret = handle;
+	*rc_ret = msg;
+	return 0;
+}
+
+/* Finish and send an rcom built by create_rcom(): run dlm_rcom_out() on it
+   (presumably the outbound byte-order conversion - confirm in util.c) and
+   commit the lowcomms buffer.  The ls argument is not used. */
+static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
+ struct dlm_rcom *rc)
+{
+ dlm_rcom_out(rc);
+ dlm_lowcomms_commit_buffer(mh);
+}
+
+/* When replying to a status request, a node also sends back its
+ configuration values. The requesting node then checks that the remote
+ node is configured the same way as itself. */
+
+/* Fill rf with this lockspace's lvb length and external flags. */
+static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
+{
+ rf->rf_lvblen = ls->ls_lvblen;
+ rf->rf_lsflags = ls->ls_exflags;
+}
+
+/* Compare the configuration reported by nodeid with our own.  Returns 0
+   when lvb length and lockspace flags both match, otherwise logs the
+   mismatch and returns -EINVAL. */
+static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
+{
+	if (rf->rf_lvblen == ls->ls_lvblen &&
+	    rf->rf_lsflags == ls->ls_exflags)
+		return 0;
+
+	log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
+		  ls->ls_lvblen, ls->ls_exflags,
+		  nodeid, rf->rf_lvblen, rf->rf_lsflags);
+	return -EINVAL;
+}
+
+/* Fetch the recovery status of nodeid into ls_recover_buf.
+
+   For the local node the status is read directly.  For a remote node a
+   DLM_RCOM_STATUS request is sent and the reply awaited; a reply of -ESRCH
+   is treated as status 0, any other reply has its configuration checked
+   against ours.  The caller reads rc_result in ls_recover_buf for the
+   status.  Returns 0 or a negative error. */
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
+{
+	struct dlm_rcom *request, *reply;
+	struct dlm_mhandle *mh;
+	int error;
+
+	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+	ls->ls_recover_nodeid = nodeid;
+
+	if (nodeid == dlm_our_nodeid()) {
+		reply = (struct dlm_rcom *) ls->ls_recover_buf;
+		reply->rc_result = dlm_recover_status(ls);
+		return 0;
+	}
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &request, &mh);
+	if (error)
+		return error;
+	request->rc_id = ++ls->ls_rcom_seq;
+
+	send_rcom(ls, mh, request);
+
+	error = dlm_wait_function(ls, &rcom_response);
+	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+	if (error)
+		return error;
+
+	reply = (struct dlm_rcom *) ls->ls_recover_buf;
+
+	if (reply->rc_result == -ESRCH) {
+		/* we pretend the remote lockspace exists with 0 status */
+		log_debug(ls, "remote node %d not ready", nodeid);
+		reply->rc_result = 0;
+		return 0;
+	}
+
+	return check_config(ls, (struct rcom_config *) reply->rc_buf, nodeid);
+}
+
+/* Answer a DLM_RCOM_STATUS request: reply to the sender with our recovery
+   status and configuration, echoing the request's rc_id.  Nothing is sent
+   if the reply buffer cannot be created. */
+static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	int sender = rc_in->rc_header.h_nodeid;
+	struct dlm_rcom *reply;
+	struct dlm_mhandle *mh;
+
+	if (create_rcom(ls, sender, DLM_RCOM_STATUS_REPLY,
+			sizeof(struct rcom_config), &reply, &mh))
+		return;
+
+	reply->rc_id = rc_in->rc_id;
+	reply->rc_result = dlm_recover_status(ls);
+	make_config(ls, (struct rcom_config *) reply->rc_buf);
+
+	send_rcom(ls, mh, reply);
+}
+
+/* Handle a reply to a synchronous request (status or names).  A reply whose
+   rc_id is not the current ls_rcom_seq is stale and dropped.  Otherwise the
+   whole message is copied into ls_recover_buf, LSFL_RCOM_READY is set and
+   the waiter on ls_wait_general is woken. */
+static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	if (rc_in->rc_id != ls->ls_rcom_seq) {
+		/* %llx needs unsigned long long arguments; a 64-bit id is
+		   not that type on every architecture */
+		log_debug(ls, "reject old reply %d got %llx wanted %llx",
+			  rc_in->rc_type,
+			  (unsigned long long)rc_in->rc_id,
+			  (unsigned long long)ls->ls_rcom_seq);
+		return;
+	}
+	memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
+	set_bit(LSFL_RCOM_READY, &ls->ls_flags);
+	wake_up(&ls->ls_wait_general);
+}
+
+/* A status reply is a synchronous reply; hand it to receive_sync_reply(). */
+static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ receive_sync_reply(ls, rc_in);
+}
+
+/* Fetch resource names from nodeid into ls_recover_buf, continuing after
+   last_name (last_len bytes).  For the local node the names are copied
+   directly, placed after a struct dlm_rcom sized gap; for a remote node a
+   DLM_RCOM_NAMES request is sent and the reply awaited.  Returns 0 or a
+   negative error. */
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
+{
+	int hdr_len = sizeof(struct dlm_rcom);
+	struct dlm_rcom *request;
+	struct dlm_mhandle *mh;
+	int error;
+
+	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+	ls->ls_recover_nodeid = nodeid;
+
+	if (nodeid == dlm_our_nodeid()) {
+		dlm_copy_master_names(ls, last_name, last_len,
+				      ls->ls_recover_buf + hdr_len,
+				      dlm_config.buffer_size - hdr_len, nodeid);
+		return 0;
+	}
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len,
+			    &request, &mh);
+	if (error)
+		return error;
+
+	memcpy(request->rc_buf, last_name, last_len);
+	request->rc_id = ++ls->ls_rcom_seq;
+
+	send_rcom(ls, mh, request);
+
+	error = dlm_wait_function(ls, &rcom_response);
+	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+	return error;
+}
+
+/* Answer a DLM_RCOM_NAMES request with a DLM_RCOM_NAMES_REPLY whose payload
+   is filled in by dlm_copy_master_names().  The request is ignored until
+   our recovery status includes DLM_RS_NODES. */
+static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	int sender = rc_in->rc_header.h_nodeid;
+	uint32_t status = dlm_recover_status(ls);
+	struct dlm_rcom *reply;
+	struct dlm_mhandle *mh;
+	int in_len, out_len;
+
+	/*
+	 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
+	 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
+	 * It could only happen in rare cases where we get a late NAMES
+	 * message from a previous instance of recovery.
+	 */
+
+	if (!(status & DLM_RS_NODES)) {
+		log_debug(ls, "ignoring RCOM_NAMES from %u", sender);
+		return;
+	}
+
+	in_len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+	out_len = dlm_config.buffer_size - sizeof(struct dlm_rcom);
+
+	if (create_rcom(ls, sender, DLM_RCOM_NAMES_REPLY, out_len,
+			&reply, &mh))
+		return;
+	reply->rc_id = rc_in->rc_id;
+
+	dlm_copy_master_names(ls, rc_in->rc_buf, in_len, reply->rc_buf,
+			      out_len, sender);
+	send_rcom(ls, mh, reply);
+}
+
+/* A names reply is a synchronous reply; hand it to receive_sync_reply(). */
+static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ receive_sync_reply(ls, rc_in);
+}
+
+/* Send a DLM_RCOM_LOOKUP for r's resource name to dir_nodeid.  The rsb
+   address is used as rc_id so the reply can be matched to the rsb.
+   Returns 0, or the error from create_rcom(). */
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
+{
+	struct dlm_ls *ls = r->res_ls;
+	struct dlm_rcom *request;
+	struct dlm_mhandle *mh;
+	int error;
+
+	error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
+			    &request, &mh);
+	if (error)
+		return error;
+
+	memcpy(request->rc_buf, r->res_name, r->res_length);
+	request->rc_id = (unsigned long) r;
+
+	send_rcom(ls, mh, request);
+	return 0;
+}
+
+/* Answer a DLM_RCOM_LOOKUP: look the received name up with
+   dlm_dir_lookup() and reply with the resulting nodeid in rc_result, or
+   with the lookup's error code if it failed. */
+static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	int sender = rc_in->rc_header.h_nodeid;
+	int name_len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+	struct dlm_rcom *reply;
+	struct dlm_mhandle *mh;
+	int error, master;
+
+	if (create_rcom(ls, sender, DLM_RCOM_LOOKUP_REPLY, 0, &reply, &mh))
+		return;
+
+	error = dlm_dir_lookup(ls, sender, rc_in->rc_buf, name_len, &master);
+	if (error)
+		reply->rc_result = error;
+	else
+		reply->rc_result = master;
+	reply->rc_id = rc_in->rc_id;
+
+	send_rcom(ls, mh, reply);
+}
+
+/* Pass a lookup reply on to dlm_recover_master_reply(). */
+static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ dlm_recover_master_reply(ls, rc_in);
+}
+
+/* Fill *rl with the state of lkb and the name of its resource r.  rl is
+   zeroed first; the lvb is copied only when the lkb has one. */
+static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			   struct rcom_lock *rl)
+{
+	memset(rl, 0, sizeof(*rl));
+
+	/* identity and flags */
+	rl->rl_lkid = lkb->lkb_id;
+	rl->rl_ownpid = lkb->lkb_ownpid;
+	rl->rl_flags = lkb->lkb_flags;
+	rl->rl_exflags = lkb->lkb_exflags;
+	rl->rl_lvbseq = lkb->lkb_lvbseq;
+
+	/* modes and queue state */
+	rl->rl_grmode = lkb->lkb_grmode;
+	rl->rl_rqmode = lkb->lkb_rqmode;
+	rl->rl_status = lkb->lkb_status;
+	rl->rl_wait_type = lkb->lkb_wait_type;
+
+	/* record which callbacks the lock has registered */
+	if (lkb->lkb_astaddr)
+		rl->rl_asts |= AST_COMP;
+	if (lkb->lkb_bastaddr)
+		rl->rl_asts |= AST_BAST;
+
+	memcpy(rl->rl_name, r->res_name, r->res_length);
+	rl->rl_namelen = r->res_length;
+
+	/* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
+	   If so, receive_rcom_lock_args() won't take this copy. */
+
+	if (lkb->lkb_lvbptr)
+		memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+}
+
+/* Send lkb to the master of r (r->res_nodeid) as a DLM_RCOM_LOCK message.
+   The payload is a struct rcom_lock, with ls_lvblen extra bytes when the
+   lkb carries an lvb.  The rsb address is used as rc_id.  Returns 0, or
+   the error from create_rcom(). */
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_ls *ls = r->res_ls;
+	int payload_len = sizeof(struct rcom_lock);
+	struct dlm_rcom *request;
+	struct dlm_mhandle *mh;
+	int error;
+
+	if (lkb->lkb_lvbptr)
+		payload_len += ls->ls_lvblen;
+
+	error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, payload_len,
+			    &request, &mh);
+	if (error)
+		return error;
+
+	pack_rcom_lock(r, lkb, (struct rcom_lock *) request->rc_buf);
+	request->rc_id = (unsigned long) r;
+
+	send_rcom(ls, mh, request);
+	return 0;
+}
+
+/* Handle a DLM_RCOM_LOCK: pass the message to dlm_recover_master_copy(),
+   then echo its rcom_lock back to the sender in a DLM_RCOM_LOCK_REPLY. */
+static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	int sender = rc_in->rc_header.h_nodeid;
+	struct dlm_rcom *reply;
+	struct dlm_mhandle *mh;
+
+	dlm_recover_master_copy(ls, rc_in);
+
+	if (create_rcom(ls, sender, DLM_RCOM_LOCK_REPLY,
+			sizeof(struct rcom_lock), &reply, &mh))
+		return;
+
+	/* We send back the same rcom_lock struct we received, but
+	   dlm_recover_master_copy() has filled in rl_remid and rl_result */
+
+	memcpy(reply->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
+	reply->rc_id = rc_in->rc_id;
+
+	send_rcom(ls, mh, reply);
+}
+
+/* Pass a DLM_RCOM_LOCK_REPLY to dlm_recover_process_copy(), but only once
+   our recovery status includes DLM_RS_DIR; earlier replies are logged and
+   ignored. */
+static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	uint32_t status = dlm_recover_status(ls);
+
+	if (status & DLM_RS_DIR) {
+		dlm_recover_process_copy(ls, rc_in);
+		return;
+	}
+
+	log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
+		  rc_in->rc_header.h_nodeid);
+}
+
+/* Reply to nodeid with a DLM_RCOM_STATUS_REPLY carrying rc_result -ESRCH,
+   echoing the lockspace id and rc_id of rc_in.  Used when no local
+   lockspace matches the incoming message.  Returns 0, or -ENOBUFS if no
+   buffer could be obtained. */
+static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
+{
+	int total_len = sizeof(struct dlm_rcom);
+	struct dlm_mhandle *handle;
+	struct dlm_rcom *reply;
+	char *buf;
+
+	handle = dlm_lowcomms_get_buffer(nodeid, total_len, GFP_KERNEL, &buf);
+	if (!handle)
+		return -ENOBUFS;
+
+	memset(buf, 0, total_len);
+	reply = (struct dlm_rcom *) buf;
+
+	reply->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	reply->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
+	reply->rc_header.h_nodeid = dlm_our_nodeid();
+	reply->rc_header.h_length = total_len;
+	reply->rc_header.h_cmd = DLM_RCOM;
+
+	reply->rc_type = DLM_RCOM_STATUS_REPLY;
+	reply->rc_id = rc_in->rc_id;
+	reply->rc_result = -ESRCH;
+
+	dlm_rcom_out(reply);
+	dlm_lowcomms_commit_buffer(handle);
+
+	return 0;
+}
+
+/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
+ recovery-only comms are sent through here. */
+
+/* Dispatch one received rcom message.
+
+   hd: the message, starting at its header (converted with dlm_rcom_in())
+   nodeid: the node lowcomms says the message came from
+
+   Messages for an unknown lockspace get a "not ready" status reply.
+   Messages other than DLM_RCOM_STATUS are ignored while recovery is
+   stopped, as are messages whose header nodeid disagrees with the sender.
+   The lockspace obtained from dlm_find_lockspace_global() is released with
+   dlm_put_lockspace() on every path that found one. */
+void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
+{
+	struct dlm_rcom *rc = (struct dlm_rcom *) hd;
+	struct dlm_ls *ls;
+
+	dlm_rcom_in(rc);
+
+	/* If the lockspace doesn't exist then still send a status message
+	   back; it's possible that it just doesn't have its global_id yet. */
+
+	ls = dlm_find_lockspace_global(hd->h_lockspace);
+	if (!ls) {
+		log_print("lockspace %x from %d not found",
+			  hd->h_lockspace, nodeid);
+		send_ls_not_ready(nodeid, rc);
+		return;
+	}
+
+	if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
+		log_error(ls, "ignoring recovery message %x from %d",
+			  rc->rc_type, nodeid);
+		goto out;
+	}
+
+	if (nodeid != rc->rc_header.h_nodeid) {
+		log_error(ls, "bad rcom nodeid %d from %d",
+			  rc->rc_header.h_nodeid, nodeid);
+		goto out;
+	}
+
+	switch (rc->rc_type) {
+	case DLM_RCOM_STATUS:
+		receive_rcom_status(ls, rc);
+		break;
+
+	case DLM_RCOM_NAMES:
+		receive_rcom_names(ls, rc);
+		break;
+
+	case DLM_RCOM_LOOKUP:
+		receive_rcom_lookup(ls, rc);
+		break;
+
+	case DLM_RCOM_LOCK:
+		receive_rcom_lock(ls, rc);
+		break;
+
+	case DLM_RCOM_STATUS_REPLY:
+		receive_rcom_status_reply(ls, rc);
+		break;
+
+	case DLM_RCOM_NAMES_REPLY:
+		receive_rcom_names_reply(ls, rc);
+		break;
+
+	case DLM_RCOM_LOOKUP_REPLY:
+		receive_rcom_lookup_reply(ls, rc);
+		break;
+
+	case DLM_RCOM_LOCK_REPLY:
+		receive_rcom_lock_reply(ls, rc);
+		break;
+
+	default:
+		/* rc_type comes off the wire, so an unknown value is bad
+		   input rather than a local invariant violation: log and
+		   drop it instead of asserting, as the midcomms dispatcher
+		   does for an unknown h_cmd. */
+		log_error(ls, "unknown rcom type %x from %d",
+			  rc->rc_type, nodeid);
+	}
+ out:
+	dlm_put_lockspace(ls);
+}
+
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RCOM_DOT_H__
+#define __RCOM_DOT_H__
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
+
+#endif
+
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..a5e6d184872e
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,765 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "dir.h"
+#include "config.h"
+#include "ast.h"
+#include "memory.h"
+#include "rcom.h"
+#include "lock.h"
+#include "lowcomms.h"
+#include "member.h"
+#include "recover.h"
+
+
+/*
+ * Recovery waiting routines: these functions wait for a particular reply from
+ * a remote node, or for the remote node to report a certain status. They need
+ * to abort if the lockspace is stopped indicating a node has failed (perhaps
+ * the one being waited for).
+ */
+
+/*
+ * Wait until given function returns non-zero or lockspace is stopped
+ * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
+ * function thinks it could have completed the waited-on task, they should wake
+ * up ls_wait_general to get an immediate response rather than waiting for the
+ * timer to detect the result. A timer wakes us up periodically while waiting
+ * to see if we should abort due to a node failure. This should only be called
+ * by the dlm_recoverd thread.
+ */
+
+/* Timer callback for dlm_wait_function(): re-arm the timer for another
+   dlm_config.recover_timer seconds and wake the waiter on ls_wait_general
+   so it re-evaluates its condition.  data is the struct dlm_ls pointer. */
+static void dlm_wait_timer_fn(unsigned long data)
+{
+	struct dlm_ls *lockspace = (struct dlm_ls *) data;
+	unsigned long next = jiffies + (dlm_config.recover_timer * HZ);
+
+	mod_timer(&lockspace->ls_timer, next);
+	wake_up(&lockspace->ls_wait_general);
+}
+
+/* Sleep on ls_wait_general until testfn(ls) returns non-zero or recovery
+   is stopped.  A periodic timer wakes the waiter so the stop condition is
+   noticed.  Returns 0 normally, -EINTR if recovery was stopped. */
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
+{
+	init_timer(&ls->ls_timer);
+	ls->ls_timer.function = dlm_wait_timer_fn;
+	ls->ls_timer.data = (long) ls;
+	ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
+	add_timer(&ls->ls_timer);
+
+	wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
+	del_timer_sync(&ls->ls_timer);
+
+	if (!dlm_recovery_stopped(ls))
+		return 0;
+
+	log_debug(ls, "dlm_wait_function aborted");
+	return -EINTR;
+}
+
+/*
+ * An efficient way for all nodes to wait for all others to have a certain
+ * status. The node with the lowest nodeid polls all the others for their
+ * status (wait_status_all) and all the others poll the node with the low id
+ * for its accumulated result (wait_status_low). When all nodes have set
+ * status flag X, then status flag X_ALL will be set on the low nodeid.
+ */
+
+/* Return a snapshot of ls_recover_status, read under ls_recover_lock. */
+uint32_t dlm_recover_status(struct dlm_ls *ls)
+{
+	uint32_t snapshot;
+
+	spin_lock(&ls->ls_recover_lock);
+	snapshot = ls->ls_recover_status;
+	spin_unlock(&ls->ls_recover_lock);
+
+	return snapshot;
+}
+
+/* OR the given status bits into ls_recover_status under ls_recover_lock.
+   Bits are only added here, never cleared. */
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
+{
+ spin_lock(&ls->ls_recover_lock);
+ ls->ls_recover_status |= status;
+ spin_unlock(&ls->ls_recover_lock);
+}
+
+/* Poll every member of ls_nodes in turn until each reports a status that
+   includes wait_status.  Between polls of the same node the sleep grows by
+   20ms per attempt up to 1000ms.  Returns 0, -EINTR if recovery was
+   stopped, or the error from dlm_rcom_status(). */
+static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
+{
+	struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
+	struct dlm_member *memb;
+	int error;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		int delay = 0;
+
+		for (;;) {
+			if (dlm_recovery_stopped(ls))
+				return -EINTR;
+
+			error = dlm_rcom_status(ls, memb->nodeid);
+			if (error)
+				return error;
+
+			if (rc->rc_result & wait_status)
+				break;
+
+			if (delay < 1000)
+				delay += 20;
+			msleep(delay);
+		}
+	}
+
+	return 0;
+}
+
+/* Poll the node with the lowest nodeid (ls_low_nodeid) until it reports a
+   status that includes wait_status, sleeping 20ms longer per attempt up to
+   1000ms.  Returns 0, -EINTR if recovery was stopped, or the error from
+   dlm_rcom_status(). */
+static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
+{
+	struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
+	int low_nodeid = ls->ls_low_nodeid;
+	int delay = 0;
+	int error;
+
+	for (;;) {
+		if (dlm_recovery_stopped(ls))
+			return -EINTR;
+
+		error = dlm_rcom_status(ls, low_nodeid);
+		if (error)
+			return error;
+
+		if (rc->rc_result & wait_status)
+			return 0;
+
+		if (delay < 1000)
+			delay += 20;
+		msleep(delay);
+	}
+}
+
+/* Wait until every node has reached status.  The "all nodes" flag is the
+   next bit up (status << 1).  The low node polls everyone for status and
+   then sets the "all" flag on itself; every other node polls the low node
+   for the "all" flag.  Returns 0 or a negative error. */
+static int wait_status(struct dlm_ls *ls, uint32_t status)
+{
+	uint32_t all_flag = status << 1;
+	int error;
+
+	if (ls->ls_low_nodeid != dlm_our_nodeid())
+		return wait_status_low(ls, all_flag);
+
+	error = wait_status_all(ls, status);
+	if (!error)
+		dlm_set_recover_status(ls, all_flag);
+	return error;
+}
+
+/* Wait for all nodes to reach DLM_RS_NODES; see wait_status(). */
+int dlm_recover_members_wait(struct dlm_ls *ls)
+{
+ return wait_status(ls, DLM_RS_NODES);
+}
+
+/* Wait for all nodes to reach DLM_RS_DIR; see wait_status(). */
+int dlm_recover_directory_wait(struct dlm_ls *ls)
+{
+ return wait_status(ls, DLM_RS_DIR);
+}
+
+/* Wait for all nodes to reach DLM_RS_LOCKS; see wait_status(). */
+int dlm_recover_locks_wait(struct dlm_ls *ls)
+{
+ return wait_status(ls, DLM_RS_LOCKS);
+}
+
+/* Wait for all nodes to reach DLM_RS_DONE; see wait_status(). */
+int dlm_recover_done_wait(struct dlm_ls *ls)
+{
+ return wait_status(ls, DLM_RS_DONE);
+}
+
+/*
+ * The recover_list contains all the rsb's for which we've requested the new
+ * master nodeid. As replies are returned from the resource directories the
+ * rsb's are removed from the list. When the list is empty we're done.
+ *
+ * The recover_list is later similarly used for all rsb's for which we've sent
+ * new lkb's and need to receive new corresponding lkid's.
+ *
+ * We use the address of the rsb struct as a simple local identifier for the
+ * rsb so we can match an rcom reply with the rsb it was sent for.
+ */
+
+/* Test function for dlm_wait_function(): non-zero when ls_recover_list has
+   no entries.  The list is inspected under ls_recover_list_lock. */
+static int recover_list_empty(struct dlm_ls *ls)
+{
+	int is_empty;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	is_empty = list_empty(&ls->ls_recover_list);
+	spin_unlock(&ls->ls_recover_list_lock);
+
+	return is_empty;
+}
+
+/* Put r on its lockspace's recover_list unless it is already on a list.
+   Adding bumps ls_recover_list_count and takes a hold on the rsb, which
+   recover_list_del() / recover_list_clear() drop again. */
+static void recover_list_add(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+	int already_listed;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	already_listed = !list_empty(&r->res_recover_list);
+	if (!already_listed) {
+		list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
+		ls->ls_recover_list_count++;
+		dlm_hold_rsb(r);
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+/* Take r off the recover_list and decrement the list count, both under
+   ls_recover_list_lock, then drop the rsb reference with dlm_put_rsb()
+   after the lock has been released. */
+static void recover_list_del(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+
+ spin_lock(&ls->ls_recover_list_lock);
+ list_del_init(&r->res_recover_list);
+ ls->ls_recover_list_count--;
+ spin_unlock(&ls->ls_recover_list_lock);
+
+ dlm_put_rsb(r);
+}
+
+/* Look up the rsb on ls_recover_list whose address equals id (the rsb
+   address is what was sent as rc_id).  Returns the rsb, or NULL if no
+   entry matches. */
+static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
+{
+	struct dlm_rsb *iter, *found = NULL;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	list_for_each_entry(iter, &ls->ls_recover_list, res_recover_list) {
+		if (id == (unsigned long) iter) {
+			found = iter;
+			break;
+		}
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+
+	return found;
+}
+
+/* Empty ls_recover_list, dropping the rsb reference held for each entry.
+   If the count is not zero afterwards it is logged and reset. */
+static void recover_list_clear(struct dlm_ls *ls)
+{
+	struct dlm_rsb *cur, *next;
+
+	spin_lock(&ls->ls_recover_list_lock);
+
+	list_for_each_entry_safe(cur, next, &ls->ls_recover_list,
+				 res_recover_list) {
+		list_del_init(&cur->res_recover_list);
+		dlm_put_rsb(cur);
+		ls->ls_recover_list_count--;
+	}
+
+	if (ls->ls_recover_list_count) {
+		log_error(ls, "warning: recover_list_count %d",
+			  ls->ls_recover_list_count);
+		ls->ls_recover_list_count = 0;
+	}
+
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+
+/* Master recovery: find new master node for rsb's that were
+ mastered on nodes that have been removed.
+
+ dlm_recover_masters
+ recover_master
+ dlm_send_rcom_lookup -> receive_rcom_lookup
+ dlm_dir_lookup
+ receive_rcom_lookup_reply <-
+ dlm_recover_master_reply
+ set_new_master
+ set_master_lkbs
+ set_lock_master
+*/
+
+/*
+ * Set the lock master for all LKBs in a lock queue
+ * If we are the new master of the rsb, we may have received new
+ * MSTCPY locks from other nodes already which we need to ignore
+ * when setting the new nodeid.
+ */
+
+/* Set lkb_nodeid to nodeid on every lkb in queue, skipping lkbs flagged
+   DLM_IFL_MSTCPY. */
+static void set_lock_master(struct list_head *queue, int nodeid)
+{
+	struct dlm_lkb *lkb;
+
+	list_for_each_entry(lkb, queue, lkb_statequeue) {
+		if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+			continue;
+		lkb->lkb_nodeid = nodeid;
+	}
+}
+
+/* Apply r->res_nodeid to the lkbs on the grant, convert and wait queues
+   of r. */
+static void set_master_lkbs(struct dlm_rsb *r)
+{
+	int master = r->res_nodeid;
+
+	set_lock_master(&r->res_grantqueue, master);
+	set_lock_master(&r->res_convertqueue, master);
+	set_lock_master(&r->res_waitqueue, master);
+}
+
+/*
+ * Propagate the new master nodeid to locks
+ * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
+ * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
+ * rsb's to consider.
+ */
+
+/* With the rsb locked: record nodeid as the rsb's master, push it to the
+   rsb's lkbs via set_master_lkbs(), and set both RSB_NEW_MASTER and
+   RSB_NEW_MASTER2. */
+static void set_new_master(struct dlm_rsb *r, int nodeid)
+{
+ lock_rsb(r);
+ r->res_nodeid = nodeid;
+ set_master_lkbs(r);
+ rsb_set_flag(r, RSB_NEW_MASTER);
+ rsb_set_flag(r, RSB_NEW_MASTER2);
+ unlock_rsb(r);
+}
+
+/*
+ * We do async lookups on rsb's that need new masters. The rsb's
+ * waiting for a lookup reply are kept on the recover_list.
+ */
+
+/* Find the new master for one rsb.  If this node is the directory node for
+   the resource the lookup is done locally and the master is set at once
+   (a result equal to our own nodeid is stored as 0).  Otherwise the rsb is
+   put on the recover_list and a lookup request is sent to the directory
+   node; the master is set when the reply arrives.  Returns the error from
+   the lookup or from sending the request. */
+static int recover_master(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+ int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+
+ dir_nodeid = dlm_dir_nodeid(r);
+
+ if (dir_nodeid == our_nodeid) {
+ error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
+ r->res_length, &ret_nodeid);
+ if (error)
+ log_error(ls, "recover dir lookup error %d", error);
+
+ /* NOTE(review): ret_nodeid has no initializer and is used below even
+    when the lookup reported an error; whether dlm_dir_lookup() always
+    stores a value in that case cannot be seen from here - confirm. */
+ if (ret_nodeid == our_nodeid)
+ ret_nodeid = 0;
+ set_new_master(r, ret_nodeid);
+ } else {
+ recover_list_add(r);
+ error = dlm_send_rcom_lookup(r, dir_nodeid);
+ }
+
+ return error;
+}
+
+/*
+ * When not using a directory, most resource names will hash to a new static
+ * master nodeid and the resource will need to be remastered.
+ *
+ * Returns 1 if the rsb was given a new master, 0 if its master is unchanged.
+ */
+
+static int recover_master_static(struct dlm_rsb *r)
+{
+	int new_master = dlm_dir_nodeid(r);
+
+	/* our own nodeid is stored as 0 */
+	if (new_master == dlm_our_nodeid())
+		new_master = 0;
+
+	if (r->res_nodeid == new_master)
+		return 0;
+
+	if (is_master(r))
+		dlm_purge_mstcpy_locks(r);
+	set_new_master(r, new_master);
+	return 1;
+}
+
+/*
+ * Go through local root resources and for each rsb which has a master which
+ * has departed, get the new master nodeid from the directory.  The dir will
+ * assign mastery to the first node to look up the new master.  That means
+ * we'll discover in this lookup if we're the new master of any rsb's.
+ *
+ * We fire off all the dir lookup requests individually and asynchronously to
+ * the correct dir node.
+ *
+ * Returns 0 once every outstanding lookup has been answered, -EINTR if
+ * recovery was stopped, or the error from a failed lookup.  On any error the
+ * recover list is cleared so no rsb is left waiting for a reply.
+ */
+
+int dlm_recover_masters(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	int error = 0, count = 0;
+
+	log_debug(ls, "dlm_recover_masters");
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		if (dlm_recovery_stopped(ls)) {
+			up_read(&ls->ls_root_sem);
+			error = -EINTR;
+			goto out;
+		}
+
+		if (dlm_no_directory(ls)) {
+			count += recover_master_static(r);
+		} else if (!is_master(r) &&
+			   dlm_is_removed(ls, r->res_nodeid)) {
+			/*
+			 * A failed lookup can never be answered; abort rather
+			 * than wait on the recover list for a reply that will
+			 * not arrive.
+			 */
+			error = recover_master(r);
+			if (error) {
+				up_read(&ls->ls_root_sem);
+				goto out;
+			}
+			count++;
+		}
+
+		schedule();
+	}
+	up_read(&ls->ls_root_sem);
+
+	log_debug(ls, "dlm_recover_masters %d resources", count);
+
+	error = dlm_wait_function(ls, &recover_list_empty);
+ out:
+	if (error)
+		recover_list_clear(ls);
+	return error;
+}
+
+/*
+ * Handle the reply to a master lookup: find the waiting rsb by the id carried
+ * in the reply, apply the returned master (our own nodeid is stored as 0),
+ * take the rsb off the recover list and wake the waiter once the list is
+ * empty.  An unknown id is only logged.  Always returns 0.
+ */
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	struct dlm_rsb *rsb = recover_list_find(ls, rc->rc_id);
+	int master;
+
+	if (!rsb) {
+		log_error(ls, "dlm_recover_master_reply no id %llx",
+			  (unsigned long long)rc->rc_id);
+		return 0;
+	}
+
+	master = rc->rc_result;
+	if (master == dlm_our_nodeid())
+		master = 0;
+
+	set_new_master(rsb, master);
+	recover_list_del(rsb);
+
+	if (recover_list_empty(ls))
+		wake_up(&ls->ls_wait_general);
+
+	return 0;
+}
+
+
+/* Lock recovery: rebuild the process-copy locks we hold on a
+ remastered rsb on the new rsb master.
+
+ dlm_recover_locks
+ recover_locks
+ recover_locks_queue
+ dlm_send_rcom_lock -> receive_rcom_lock
+ dlm_recover_master_copy
+ receive_rcom_lock_reply <-
+ dlm_recover_process_copy
+*/
+
+
+/*
+ * Send every lkb on one queue to the new master, counting each successful
+ * send in res_recover_locks_count; when an equal number of replies has come
+ * back, recovery for the rsb is done.  Stops at the first send error and
+ * returns it; returns 0 when the whole queue was sent.
+ */
+
+static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
+{
+	struct dlm_lkb *entry;
+	int rv;
+
+	list_for_each_entry(entry, head, lkb_statequeue) {
+		rv = dlm_send_rcom_lock(r, entry);
+		if (rv)
+			return rv;
+		r->res_recover_locks_count++;
+	}
+
+	return 0;
+}
+
+/*
+ * Send all of our locks on one remastered rsb to its new master, queue by
+ * queue (grant, convert, wait).  If anything was sent the rsb is put on the
+ * recover list to wait for replies; if there was nothing to send,
+ * RSB_NEW_MASTER is cleared right away.  Returns the first send error, or 0.
+ */
+static int recover_locks(struct dlm_rsb *r)
+{
+	int rv;
+
+	lock_rsb(r);
+
+	DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
+
+	rv = recover_locks_queue(r, &r->res_grantqueue);
+	if (!rv)
+		rv = recover_locks_queue(r, &r->res_convertqueue);
+	if (!rv)
+		rv = recover_locks_queue(r, &r->res_waitqueue);
+
+	if (!rv) {
+		if (r->res_recover_locks_count)
+			recover_list_add(r);
+		else
+			rsb_clear_flag(r, RSB_NEW_MASTER);
+	}
+
+	unlock_rsb(r);
+	return rv;
+}
+
+/*
+ * Walk the root list and, for every rsb flagged RSB_NEW_MASTER that we do
+ * not master ourselves, send our locks to the new master; then wait until
+ * all replies have emptied the recover list.  On success DLM_RS_LOCKS is set
+ * in the recover status; on error (-EINTR if recovery was stopped, or a send
+ * error) the recover list is cleared.
+ */
+int dlm_recover_locks(struct dlm_ls *ls)
+{
+	struct dlm_rsb *rsb;
+	int error = 0, count = 0;
+
+	log_debug(ls, "dlm_recover_locks");
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(rsb, &ls->ls_root_list, res_root_list) {
+		if (is_master(rsb)) {
+			rsb_clear_flag(rsb, RSB_NEW_MASTER);
+			continue;
+		}
+
+		if (!rsb_flag(rsb, RSB_NEW_MASTER))
+			continue;
+
+		if (dlm_recovery_stopped(ls)) {
+			error = -EINTR;
+			break;
+		}
+
+		error = recover_locks(rsb);
+		if (error)
+			break;
+
+		count += rsb->res_recover_locks_count;
+	}
+	up_read(&ls->ls_root_sem);
+
+	if (!error) {
+		log_debug(ls, "dlm_recover_locks %d locks", count);
+		error = dlm_wait_function(ls, &recover_list_empty);
+	}
+
+	if (error)
+		recover_list_clear(ls);
+	else
+		dlm_set_recover_status(ls, DLM_RS_LOCKS);
+	return error;
+}
+
+/*
+ * Account for one recovered lock on the rsb.  When the outstanding count
+ * drops to zero the rsb is finished: RSB_NEW_MASTER is cleared and the rsb
+ * leaves the recover list.  The waiter is woken whenever the recover list is
+ * found empty.
+ */
+void dlm_recovered_lock(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+
+	DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
+
+	if (--r->res_recover_locks_count == 0) {
+		rsb_clear_flag(r, RSB_NEW_MASTER);
+		recover_list_del(r);
+	}
+
+	if (recover_list_empty(ls))
+		wake_up(&ls->ls_wait_general);
+}
+
+/*
+ * The lvb needs to be recovered on all master rsb's.  This includes setting
+ * the VALNOTVALID flag if necessary, and determining the correct lvb contents
+ * based on the lvb's of the locks held on the rsb.
+ *
+ * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb.  If it
+ * was already set prior to recovery, it's not cleared, regardless of locks.
+ *
+ * The LVB contents are only considered for changing when this is a new master
+ * of the rsb (NEW_MASTER2).  Then, the rsb's lvb is taken from any lkb with
+ * mode > CR.  If no lkb's exist with mode above CR, the lvb contents are taken
+ * from the lkb with the largest lvb sequence number.
+ */
+
+/*
+ * Scan one queue for lkbs that carry an lvb (DLM_LKF_VALBLK).  Returns the
+ * first such lkb granted above CR, stopping the scan there, or NULL if the
+ * queue has none.  Along the way *lvb_seen is set when any lvb-carrying lkb
+ * is met, and *best / *best_seq track the lkb with the highest lvb sequence
+ * number (compared as a signed difference).
+ */
+static struct dlm_lkb *scan_queue_lvbs(struct list_head *queue, int *lvb_seen,
+				       struct dlm_lkb **best,
+				       uint32_t *best_seq)
+{
+	struct dlm_lkb *lkb;
+
+	list_for_each_entry(lkb, queue, lkb_statequeue) {
+		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+			continue;
+
+		*lvb_seen = 1;
+
+		if (lkb->lkb_grmode > DLM_LOCK_CR)
+			return lkb;
+
+		if (((int)lkb->lkb_lvbseq - (int)*best_seq) >= 0) {
+			*best = lkb;
+			*best_seq = lkb->lkb_lvbseq;
+		}
+	}
+
+	return NULL;
+}
+
+static void recover_lvb(struct dlm_rsb *r)
+{
+	struct dlm_lkb *big_lkb, *high_lkb = NULL;
+	uint32_t high_seq = 0;
+	int lock_lvb_exists = 0;
+	int lvblen = r->res_ls->ls_lvblen;
+
+	/* the convert queue is only examined if the grant queue has no big lock */
+	big_lkb = scan_queue_lvbs(&r->res_grantqueue, &lock_lvb_exists,
+				  &high_lkb, &high_seq);
+	if (!big_lkb)
+		big_lkb = scan_queue_lvbs(&r->res_convertqueue,
+					  &lock_lvb_exists, &high_lkb,
+					  &high_seq);
+
+	if (!lock_lvb_exists)
+		return;
+
+	if (!big_lkb)
+		rsb_set_flag(r, RSB_VALNOTVALID);
+
+	/* don't mess with the lvb unless we're the new master */
+	if (!rsb_flag(r, RSB_NEW_MASTER2))
+		return;
+
+	if (!r->res_lvbptr) {
+		r->res_lvbptr = allocate_lvb(r->res_ls);
+		if (!r->res_lvbptr)
+			return;
+	}
+
+	if (big_lkb) {
+		r->res_lvbseq = big_lkb->lkb_lvbseq;
+		memcpy(r->res_lvbptr, big_lkb->lkb_lvbptr, lvblen);
+	} else if (high_lkb) {
+		r->res_lvbseq = high_lkb->lkb_lvbseq;
+		memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
+	} else {
+		r->res_lvbseq = 0;
+		memset(r->res_lvbptr, 0, lvblen);
+	}
+}
+
+/* All master rsb's flagged RECOVER_CONVERT need to be looked at.  The locks
+   converting PR->CW or CW->PR need to have their lkb_grmode set.
+
+   Converting locks whose granted mode is DLM_LOCK_IV take the mode of the
+   first PR or CW lock found on the grant queue, or their own requested mode
+   when the grant queue holds no such lock. */
+
+static void recover_conversion(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb;
+	int found_mode = -1;
+
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		if (lkb->lkb_grmode != DLM_LOCK_PR &&
+		    lkb->lkb_grmode != DLM_LOCK_CW)
+			continue;
+		found_mode = lkb->lkb_grmode;
+		break;
+	}
+
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		if (lkb->lkb_grmode == DLM_LOCK_IV)
+			lkb->lkb_grmode = (found_mode == -1) ?
+					  lkb->lkb_rqmode : found_mode;
+	}
+}
+
+/* We've become the new master for this rsb and waiting/converting locks may
+   need to be granted in dlm_grant_after_purge() due to locks that may have
+   existed from a removed node.  Flag the rsb when either queue is in use. */
+
+static void set_locks_purged(struct dlm_rsb *r)
+{
+	if (list_empty(&r->res_waitqueue) && list_empty(&r->res_convertqueue))
+		return;
+
+	rsb_set_flag(r, RSB_LOCKS_PURGED);
+}
+
+/*
+ * Finish recovery of one rsb that we master: resolve conversions if it is
+ * flagged RECOVER_CONVERT, note purged locks if we are its new master, then
+ * recover its lvb.  Called with the rsb locked.
+ */
+static void finish_master_rsb(struct dlm_rsb *r)
+{
+	if (rsb_flag(r, RSB_RECOVER_CONVERT))
+		recover_conversion(r);
+	if (rsb_flag(r, RSB_NEW_MASTER2))
+		set_locks_purged(r);
+	recover_lvb(r);
+}
+
+/*
+ * Final pass over the root list: finish every rsb we master and clear the
+ * RECOVER_CONVERT and NEW_MASTER2 flags on all rsb's, mastered or not.
+ */
+void dlm_recover_rsbs(struct dlm_ls *ls)
+{
+	struct dlm_rsb *rsb;
+	int mastered = 0;
+
+	log_debug(ls, "dlm_recover_rsbs");
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(rsb, &ls->ls_root_list, res_root_list) {
+		lock_rsb(rsb);
+		if (is_master(rsb)) {
+			finish_master_rsb(rsb);
+			mastered++;
+		}
+		rsb_clear_flag(rsb, RSB_RECOVER_CONVERT);
+		rsb_clear_flag(rsb, RSB_NEW_MASTER2);
+		unlock_rsb(rsb);
+	}
+	up_read(&ls->ls_root_sem);
+
+	log_debug(ls, "dlm_recover_rsbs %d rsbs", mastered);
+}
+
+/* Create a single list of all root rsb's to be used during recovery.
+   Every rsb on the active list of every hash bucket is added to ls_root_list
+   and gets a reference taken on it.  Returns -EINVAL (and adds nothing) if
+   the root list is not empty to begin with, 0 otherwise. */
+
+int dlm_create_root_list(struct dlm_ls *ls)
+{
+	struct dlm_rsb *rsb;
+	int bucket, error = 0;
+
+	down_write(&ls->ls_root_sem);
+
+	if (list_empty(&ls->ls_root_list)) {
+		for (bucket = 0; bucket < ls->ls_rsbtbl_size; bucket++) {
+			read_lock(&ls->ls_rsbtbl[bucket].lock);
+			list_for_each_entry(rsb, &ls->ls_rsbtbl[bucket].list,
+					    res_hashchain) {
+				list_add(&rsb->res_root_list,
+					 &ls->ls_root_list);
+				dlm_hold_rsb(rsb);
+			}
+			read_unlock(&ls->ls_rsbtbl[bucket].lock);
+		}
+	} else {
+		log_error(ls, "root list not empty");
+		error = -EINVAL;
+	}
+
+	up_write(&ls->ls_root_sem);
+	return error;
+}
+
+/*
+ * Undo dlm_create_root_list(): unlink every rsb from the root list and drop
+ * the reference that was taken when it was added.  Calling this on an empty
+ * list does nothing.
+ */
+void dlm_release_root_list(struct dlm_ls *ls)
+{
+	struct dlm_rsb *rsb, *next;
+
+	down_write(&ls->ls_root_sem);
+
+	list_for_each_entry_safe(rsb, next, &ls->ls_root_list, res_root_list) {
+		/* del_init leaves the rsb's own list node empty */
+		list_del_init(&rsb->res_root_list);
+		dlm_put_rsb(rsb);
+	}
+
+	up_write(&ls->ls_root_sem);
+}
+
+/*
+ * Free every rsb sitting on the toss list of every hash bucket, holding the
+ * bucket's lock for writing while its list is emptied.
+ */
+void dlm_clear_toss_list(struct dlm_ls *ls)
+{
+	struct dlm_rsb *rsb, *next;
+	int bucket;
+
+	for (bucket = 0; bucket < ls->ls_rsbtbl_size; bucket++) {
+		write_lock(&ls->ls_rsbtbl[bucket].lock);
+
+		list_for_each_entry_safe(rsb, next,
+					 &ls->ls_rsbtbl[bucket].toss,
+					 res_hashchain) {
+			list_del(&rsb->res_hashchain);
+			free_rsb(rsb);
+		}
+
+		write_unlock(&ls->ls_rsbtbl[bucket].lock);
+	}
+}
+
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVER_DOT_H__
+#define __RECOVER_DOT_H__
+
+/* Recovery status and waiting; testfn is polled for completion by
+   dlm_wait_function() (e.g. callers pass &recover_list_empty). */
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
+uint32_t dlm_recover_status(struct dlm_ls *ls);
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
+int dlm_recover_members_wait(struct dlm_ls *ls);
+int dlm_recover_directory_wait(struct dlm_ls *ls);
+int dlm_recover_locks_wait(struct dlm_ls *ls);
+int dlm_recover_done_wait(struct dlm_ls *ls);
+/* Master recovery: look up new masters and handle the lookup replies. */
+int dlm_recover_masters(struct dlm_ls *ls);
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
+/* Lock recovery: send our locks to new masters and count the replies. */
+int dlm_recover_locks(struct dlm_ls *ls);
+void dlm_recovered_lock(struct dlm_rsb *r);
+/* Root list of rsb's used as the basis of the recovery passes. */
+int dlm_create_root_list(struct dlm_ls *ls);
+void dlm_release_root_list(struct dlm_ls *ls);
+void dlm_clear_toss_list(struct dlm_ls *ls);
+/* Final per-rsb pass: conversions, purged-lock flag and lvb recovery. */
+void dlm_recover_rsbs(struct dlm_ls *ls);
+
+#endif /* __RECOVER_DOT_H__ */
+
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..362e3eff4dc9
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,290 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "dir.h"
+#include "ast.h"
+#include "recover.h"
+#include "lowcomms.h"
+#include "lock.h"
+#include "requestqueue.h"
+#include "recoverd.h"
+
+
+/* If the start for which we're re-enabling locking (seq) has been superseded
+   by a newer stop (ls_recover_seq), we need to leave locking disabled.
+   Returns 0 when locking was re-enabled, -EINTR when it was left disabled. */
+
+static int enable_locking(struct dlm_ls *ls, uint64_t seq)
+{
+	int error = 0;
+
+	spin_lock(&ls->ls_recover_lock);
+	if (ls->ls_recover_seq != seq) {
+		error = -EINTR;
+	} else {
+		set_bit(LSFL_RUNNING, &ls->ls_flags);
+		up_write(&ls->ls_in_recovery);
+	}
+	spin_unlock(&ls->ls_recover_lock);
+
+	return error;
+}
+
+/*
+ * Run one complete recovery of the lockspace for the start described by rv.
+ * The whole sequence runs with ls_recoverd_active held.  Returns 0 when
+ * recovery completed and locking was re-enabled, or the error of the first
+ * step that failed; on failure the root list is released before returning.
+ */
+static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
+{
+	unsigned long start;
+	int error, neg = 0;
+
+	/* the sequence number is 64 bits wide; cast it to match %llx */
+	log_debug(ls, "recover %llx", (unsigned long long)rv->seq);
+
+	mutex_lock(&ls->ls_recoverd_active);
+
+	/*
+	 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
+	 * will be processed by dlm_astd during recovery.
+	 */
+
+	dlm_astd_suspend();
+	dlm_astd_resume();
+
+	/*
+	 * This list of root rsb's will be the basis of most of the recovery
+	 * routines.
+	 */
+
+	dlm_create_root_list(ls);
+
+	/*
+	 * Free all the tossed rsb's so we don't have to recover them.
+	 */
+
+	dlm_clear_toss_list(ls);
+
+	/*
+	 * Add or remove nodes from the lockspace's ls_nodes list.
+	 * Also waits for all nodes to complete dlm_recover_members.
+	 */
+
+	error = dlm_recover_members(ls, rv, &neg);
+	if (error) {
+		log_error(ls, "recover_members failed %d", error);
+		goto fail;
+	}
+	start = jiffies;
+
+	/*
+	 * Rebuild our own share of the directory by collecting from all other
+	 * nodes their master rsb names that hash to us.
+	 */
+
+	error = dlm_recover_directory(ls);
+	if (error) {
+		log_error(ls, "recover_directory failed %d", error);
+		goto fail;
+	}
+
+	/*
+	 * Purge directory-related requests that are saved in requestqueue.
+	 * All dir requests from before recovery are invalid now due to the dir
+	 * rebuild and will be resent by the requesting nodes.
+	 */
+
+	dlm_purge_requestqueue(ls);
+
+	/*
+	 * Wait for all nodes to complete directory rebuild.
+	 */
+
+	error = dlm_recover_directory_wait(ls);
+	if (error) {
+		log_error(ls, "recover_directory_wait failed %d", error);
+		goto fail;
+	}
+
+	/*
+	 * We may have outstanding operations that are waiting for a reply from
+	 * a failed node.  Mark these to be resent after recovery.  Unlock and
+	 * cancel ops can just be completed.
+	 */
+
+	dlm_recover_waiters_pre(ls);
+
+	error = dlm_recovery_stopped(ls);
+	if (error)
+		goto fail;
+
+	if (neg || dlm_no_directory(ls)) {
+		/*
+		 * Clear lkb's for departed nodes.
+		 */
+
+		dlm_purge_locks(ls);
+
+		/*
+		 * Get new master nodeid's for rsb's that were mastered on
+		 * departed nodes.
+		 */
+
+		error = dlm_recover_masters(ls);
+		if (error) {
+			log_error(ls, "recover_masters failed %d", error);
+			goto fail;
+		}
+
+		/*
+		 * Send our locks on remastered rsb's to the new masters.
+		 */
+
+		error = dlm_recover_locks(ls);
+		if (error) {
+			log_error(ls, "recover_locks failed %d", error);
+			goto fail;
+		}
+
+		error = dlm_recover_locks_wait(ls);
+		if (error) {
+			log_error(ls, "recover_locks_wait failed %d", error);
+			goto fail;
+		}
+
+		/*
+		 * Finalize state in master rsb's now that all locks can be
+		 * checked.  This includes conversion resolution and lvb
+		 * settings.
+		 */
+
+		dlm_recover_rsbs(ls);
+	}
+
+	dlm_release_root_list(ls);
+
+	dlm_set_recover_status(ls, DLM_RS_DONE);
+	error = dlm_recover_done_wait(ls);
+	if (error) {
+		log_error(ls, "recover_done_wait failed %d", error);
+		goto fail;
+	}
+
+	dlm_clear_members_gone(ls);
+
+	error = enable_locking(ls, rv->seq);
+	if (error) {
+		log_error(ls, "enable_locking failed %d", error);
+		goto fail;
+	}
+
+	error = dlm_process_requestqueue(ls);
+	if (error) {
+		log_error(ls, "process_requestqueue failed %d", error);
+		goto fail;
+	}
+
+	error = dlm_recover_waiters_post(ls);
+	if (error) {
+		log_error(ls, "recover_waiters_post failed %d", error);
+		goto fail;
+	}
+
+	dlm_grant_after_purge(ls);
+
+	dlm_astd_wake();
+
+	log_debug(ls, "recover %llx done: %u ms", (unsigned long long)rv->seq,
+		  jiffies_to_msecs(jiffies - start));
+	mutex_unlock(&ls->ls_recoverd_active);
+
+	return 0;
+
+ fail:
+	/* the root list may already have been released above; releasing an
+	   empty list is a no-op */
+	dlm_release_root_list(ls);
+	log_debug(ls, "recover %llx error %d", (unsigned long long)rv->seq,
+		  error);
+	mutex_unlock(&ls->ls_recoverd_active);
+	return error;
+}
+
+/*
+ * Take the pending recovery arguments off the lockspace (clearing the
+ * RECOVERY_STOP flag under the same lock) and, if there were any, run the
+ * recovery and free the arguments afterwards.
+ */
+static void do_ls_recovery(struct dlm_ls *ls)
+{
+	struct dlm_recover *args;
+
+	spin_lock(&ls->ls_recover_lock);
+	args = ls->ls_recover_args;
+	ls->ls_recover_args = NULL;
+	clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+	spin_unlock(&ls->ls_recover_lock);
+
+	if (!args)
+		return;
+
+	ls_recover(ls, args);
+	kfree(args->nodeids);
+	kfree(args);
+}
+
+/*
+ * Body of the per-lockspace recovery thread.  arg identifies the lockspace
+ * and is resolved with dlm_find_lockspace_local(); the lockspace is put
+ * again on exit.  The thread sleeps until LSFL_WORK is set (see
+ * dlm_recoverd_kick()), runs one recovery for it, and repeats until it is
+ * asked to stop.  Returns -1 if the lockspace cannot be found, 0 otherwise.
+ */
+static int dlm_recoverd(void *arg)
+{
+	struct dlm_ls *ls;
+
+	ls = dlm_find_lockspace_local(arg);
+	if (!ls) {
+		log_print("dlm_recoverd: no lockspace %p", arg);
+		return -1;
+	}
+
+	while (1) {
+		/*
+		 * kthread_should_stop() must be tested after the task state
+		 * is set: if kthread_stop() ran between a test at the top of
+		 * the loop and set_current_state(), its wakeup would be lost
+		 * and schedule() below would sleep forever.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop()) {
+			set_current_state(TASK_RUNNING);
+			break;
+		}
+		if (!test_bit(LSFL_WORK, &ls->ls_flags))
+			schedule();
+		set_current_state(TASK_RUNNING);
+
+		if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
+			do_ls_recovery(ls);
+	}
+
+	dlm_put_lockspace(ls);
+	return 0;
+}
+
+/*
+ * Flag that recovery work is pending for this lockspace and wake its
+ * recovery thread.  The flag is set before the wakeup so that dlm_recoverd()
+ * finds LSFL_WORK set when it runs.
+ */
+void dlm_recoverd_kick(struct dlm_ls *ls)
+{
+ set_bit(LSFL_WORK, &ls->ls_flags);
+ wake_up_process(ls->ls_recoverd_task);
+}
+
+/*
+ * Create and start the recovery thread for a lockspace, remembering the task
+ * in ls_recoverd_task.  Returns 0 on success or the error encoded in the
+ * pointer returned by kthread_run().
+ */
+int dlm_recoverd_start(struct dlm_ls *ls)
+{
+	struct task_struct *task;
+
+	task = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	ls->ls_recoverd_task = task;
+	return 0;
+}
+
+/* Stop the recovery thread recorded in ls_recoverd_task by
+   dlm_recoverd_start(). */
+void dlm_recoverd_stop(struct dlm_ls *ls)
+{
+ kthread_stop(ls->ls_recoverd_task);
+}
+
+/*
+ * Keep recovery from running: take ls_recoverd_active, the mutex that
+ * ls_recover() holds for the whole of a recovery, so this blocks until any
+ * recovery in progress has returned.  Paired with dlm_recoverd_resume().
+ * NOTE(review): the wake_up presumably nudges a recovery that is sleeping in
+ * a wait function on ls_wait_general so it can notice a stop - confirm.
+ */
+void dlm_recoverd_suspend(struct dlm_ls *ls)
+{
+ wake_up(&ls->ls_wait_general);
+ mutex_lock(&ls->ls_recoverd_active);
+}
+
+/* Release the mutex taken by dlm_recoverd_suspend(), allowing ls_recover()
+   to run again. */
+void dlm_recoverd_resume(struct dlm_ls *ls)
+{
+ mutex_unlock(&ls->ls_recoverd_active);
+}
+
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVERD_DOT_H__
+#define __RECOVERD_DOT_H__
+
+/* Per-lockspace recovery thread: kick sets LSFL_WORK and wakes the thread;
+   start/stop create and stop it; suspend/resume take and release
+   ls_recoverd_active. */
+void dlm_recoverd_kick(struct dlm_ls *ls);
+void dlm_recoverd_stop(struct dlm_ls *ls);
+int dlm_recoverd_start(struct dlm_ls *ls);
+void dlm_recoverd_suspend(struct dlm_ls *ls);
+void dlm_recoverd_resume(struct dlm_ls *ls);
+
+#endif /* __RECOVERD_DOT_H__ */
+
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "member.h"
+#include "lock.h"
+#include "dir.h"
+#include "config.h"
+#include "requestqueue.h"
+
+/*
+ * One saved request: list linkage on ls_requestqueue, the nodeid the message
+ * came from, and the raw message bytes.  The message is variable length and
+ * is stored directly behind the fixed fields, so the array is declared with
+ * no storage of its own; allocate sizeof(struct rq_entry) + message length.
+ */
+struct rq_entry {
+	struct list_head list;
+	int nodeid;
+	char request[0];
+};
+
+/*
+ * Requests received while the lockspace is in recovery get added to the
+ * request queue and processed when recovery is complete.  This happens when
+ * the lockspace is suspended on some nodes before it is on others, or the
+ * lockspace is enabled on some while still suspended on others.
+ *
+ * The message (hd->h_length bytes starting at hd) is copied into a newly
+ * allocated entry at the tail of the queue.  Messages from removed nodes are
+ * dropped, and so is the message if the allocation fails.
+ */
+
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
+{
+	struct rq_entry *e;
+	int length = hd->h_length;
+
+	if (dlm_is_removed(ls, nodeid))
+		return;
+
+	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
+	if (!e) {
+		/* no trailing newline, like every other log call here */
+		log_print("dlm_add_requestqueue: out of memory");
+		return;
+	}
+
+	e->nodeid = nodeid;
+	memcpy(e->request, hd, length);
+
+	mutex_lock(&ls->ls_requestqueue_mutex);
+	list_add_tail(&e->list, &ls->ls_requestqueue);
+	mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
+/*
+ * Replay the saved requests in order.  The queue mutex is dropped while a
+ * message is being handled and retaken to remove the entry afterwards.
+ * Returns 0 when the queue has been drained, or -EINTR if handling a message
+ * returned -EINTR (that entry stays queued) or locking was stopped again
+ * while the queue was being processed.
+ */
+int dlm_process_requestqueue(struct dlm_ls *ls)
+{
+	struct rq_entry *entry;
+	int rv;
+
+	mutex_lock(&ls->ls_requestqueue_mutex);
+
+	while (!list_empty(&ls->ls_requestqueue)) {
+		entry = list_entry(ls->ls_requestqueue.next, struct rq_entry,
+				   list);
+		mutex_unlock(&ls->ls_requestqueue_mutex);
+
+		rv = dlm_receive_message((struct dlm_header *) entry->request,
+					 entry->nodeid, 1);
+		if (rv == -EINTR) {
+			/* entry is left on requestqueue */
+			log_debug(ls, "process_requestqueue abort eintr");
+			return rv;
+		}
+
+		mutex_lock(&ls->ls_requestqueue_mutex);
+		list_del(&entry->list);
+		kfree(entry);
+
+		if (dlm_locking_stopped(ls)) {
+			log_debug(ls, "process_requestqueue abort running");
+			mutex_unlock(&ls->ls_requestqueue_mutex);
+			return -EINTR;
+		}
+		schedule();
+	}
+
+	mutex_unlock(&ls->ls_requestqueue_mutex);
+	return 0;
+}
+
+/*
+ * After recovery is done, locking is resumed and dlm_recoverd takes all the
+ * saved requests and processes them as they would have been by dlm_recvd.  At
+ * the same time, dlm_recvd will start receiving new requests from remote
+ * nodes.  We want to delay dlm_recvd processing new requests until
+ * dlm_recoverd has finished processing the old saved requests.
+ *
+ * Returns once the request queue is empty or locking has been stopped,
+ * yielding the cpu between checks.
+ */
+
+void dlm_wait_requestqueue(struct dlm_ls *ls)
+{
+	int done;
+
+	do {
+		mutex_lock(&ls->ls_requestqueue_mutex);
+		done = list_empty(&ls->ls_requestqueue) ||
+		       dlm_locking_stopped(ls);
+		mutex_unlock(&ls->ls_requestqueue_mutex);
+
+		if (!done)
+			schedule();
+	} while (!done);
+}
+
+/*
+ * Decide whether a saved message must be thrown away during recovery.
+ * Returns 1 to purge it, 0 to keep it.
+ */
+static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
+{
+	uint32_t msg_type = ms->m_type;
+
+	/* nothing from a removed node is kept */
+	if (dlm_is_removed(ls, nodeid))
+		return 1;
+
+	/* directory operations are always purged because the directory is
+	   always rebuilt during recovery and the lookups resent */
+
+	switch (msg_type) {
+	case DLM_MSG_REMOVE:
+	case DLM_MSG_LOOKUP:
+	case DLM_MSG_LOOKUP_REPLY:
+		return 1;
+	}
+
+	if (!dlm_no_directory(ls))
+		return 0;
+
+	/* with no directory, the master is likely to change as a part of
+	   recovery; requests to/from the defunct master need to be purged */
+
+	switch (msg_type) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_CANCEL:
+		/* we're no longer the master of this resource, the sender
+		   will resend to the new master (see waiter_needs_recovery) */
+
+		return dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid();
+
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_UNLOCK_REPLY:
+	case DLM_MSG_CANCEL_REPLY:
+	case DLM_MSG_GRANT:
+		/* this reply is from the former master of the resource,
+		   we'll resend to the new master if needed */
+
+		return dlm_hash2nodeid(ls, ms->m_hash) != nodeid;
+
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Walk the saved requests under the queue mutex and free every entry that
+ * purge_request() says must not survive recovery.
+ */
+void dlm_purge_requestqueue(struct dlm_ls *ls)
+{
+	struct rq_entry *entry, *next;
+
+	mutex_lock(&ls->ls_requestqueue_mutex);
+
+	list_for_each_entry_safe(entry, next, &ls->ls_requestqueue, list) {
+		if (!purge_request(ls, (struct dlm_message *) entry->request,
+				   entry->nodeid))
+			continue;
+
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __REQUESTQUEUE_DOT_H__
+#define __REQUESTQUEUE_DOT_H__
+
+/* Queue of messages received while the lockspace is in recovery: add saves
+   a copy, process replays them after recovery, wait blocks until the queue
+   is empty or locking has stopped, purge drops entries that recovery has
+   made invalid. */
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
+int dlm_process_requestqueue(struct dlm_ls *ls);
+void dlm_wait_requestqueue(struct dlm_ls *ls);
+void dlm_purge_requestqueue(struct dlm_ls *ls);
+
+#endif
+
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 000000000000..c37e93e4f2df
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,788 @@
+/*
+ * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/signal.h>
+#include <linux/spinlock.h>
+#include <linux/dlm.h>
+#include <linux/dlm_device.h>
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "lvb_table.h"
+
+static const char *name_prefix="dlm";
+static struct miscdevice ctl_device;
+static struct file_operations device_fops;
+
+#ifdef CONFIG_COMPAT
+
+struct dlm_lock_params32 { /* compat layout of the lock parameters: the ast
+   and lksb user pointers are carried as __u32 and widened to
+   pointers by compat_input() */
+ __u8 mode;
+ __u8 namelen;
+ __u16 flags;
+ __u32 lkid;
+ __u32 parent;
+
+ __u32 castparam;
+ __u32 castaddr;
+ __u32 bastparam;
+ __u32 bastaddr;
+ __u32 lksb;
+
+ char lvb[DLM_USER_LVB_LEN];
+ char name[0]; /* compat_input() copies namelen bytes from here */
+};
+
+struct dlm_write_request32 { /* compat layout of a write request; cmd
+   decides which member of the union compat_input() reads */
+ __u32 version[3];
+ __u8 cmd;
+ __u8 is64bit;
+ __u8 unused[2];
+
+ union {
+ struct dlm_lock_params32 lock;
+ struct dlm_lspace_params lspace;
+ } i;
+};
+
+struct dlm_lksb32 { /* compat layout of the lock status block filled in by
+   compat_output() */
+ __u32 sb_status;
+ __u32 sb_lkid;
+ __u8 sb_flags;
+ __u32 sb_lvbptr; /* pointer value truncated to 32 bits */
+};
+
+struct dlm_lock_result32 { /* compat layout of the ast result that
+   copy_result_to_user() hands back when the proc is flagged
+   DLM_PROC_FLAGS_COMPAT; pointers are carried as __u32 */
+ __u32 length;
+ __u32 user_astaddr;
+ __u32 user_astparam;
+ __u32 user_lksb;
+ struct dlm_lksb32 lksb;
+ __u8 bast_mode;
+ __u8 unused[3];
+ /* Offsets may be zero if no data is present */
+ __u32 lvb_offset;
+};
+
+static void compat_input(struct dlm_write_request *kb,
+ struct dlm_write_request32 *kb32)
+{ /* Expand the compat request kb32 into the native request kb, field by
+   field.  Create/remove-lockspace commands copy the lspace params;
+   every other command is treated as a lock request and has its __u32
+   pointer fields widened to pointers.
+   NOTE(review): the strcpy() of the lockspace name and the memcpy() of
+   namelen bytes of lock name both trust user-supplied data; nothing in
+   this function bounds them by the size of either buffer -- confirm
+   the caller guarantees termination/length. */
+ kb->version[0] = kb32->version[0];
+ kb->version[1] = kb32->version[1];
+ kb->version[2] = kb32->version[2];
+
+ kb->cmd = kb32->cmd;
+ kb->is64bit = kb32->is64bit;
+ if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
+ kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
+ kb->i.lspace.flags = kb32->i.lspace.flags;
+ kb->i.lspace.minor = kb32->i.lspace.minor;
+ strcpy(kb->i.lspace.name, kb32->i.lspace.name);
+ } else {
+ kb->i.lock.mode = kb32->i.lock.mode;
+ kb->i.lock.namelen = kb32->i.lock.namelen;
+ kb->i.lock.flags = kb32->i.lock.flags;
+ kb->i.lock.lkid = kb32->i.lock.lkid;
+ kb->i.lock.parent = kb32->i.lock.parent;
+ kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
+ kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
+ kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
+ kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
+ kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
+ memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
+ memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
+ }
+}
+
+/*
+ * compat_output - narrow a native lock result into the compat layout
+ * @res: fully populated native result
+ * @res32: destination; completely overwritten
+ *
+ * User pointers are truncated to __u32.  res->length and res->lvb_offset
+ * are copied unchanged: copy_result_to_user() already computes them from
+ * sizeof(struct dlm_lock_result32) when the reader is a compat process.
+ */
+static void compat_output(struct dlm_lock_result *res,
+			  struct dlm_lock_result32 *res32)
+{
+	/* res32 is copied to userspace as a whole by the caller, so clear it
+	   first; otherwise unused[] and any padding would carry whatever
+	   was on the caller's stack */
+	memset(res32, 0, sizeof(*res32));
+
+	res32->length = res->length;
+	res32->lvb_offset = res->lvb_offset;
+
+	res32->user_astaddr = (__u32)(long)res->user_astaddr;
+	res32->user_astparam = (__u32)(long)res->user_astparam;
+	res32->user_lksb = (__u32)(long)res->user_lksb;
+	res32->bast_mode = res->bast_mode;
+
+	res32->lksb.sb_status = res->lksb.sb_status;
+	res32->lksb.sb_flags = res->lksb.sb_flags;
+	res32->lksb.sb_lkid = res->lksb.sb_lkid;
+	res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
+}
+#endif
+
+
+/*
+ * dlm_user_add_ast - queue an ast on the owning process's asts list so a
+ * later device_read() can return it
+ * @lkb: lock the ast belongs to; lkb_astparam holds its dlm_user_args
+ * @type: AST_COMP or AST_BAST
+ *
+ * Nothing is queued if the lkb is flagged ORPHAN/DEAD, or if a blocking
+ * ast is requested but the user registered no bastaddr.
+ */
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
+{
+	struct dlm_ls *ls;
+	struct dlm_user_args *ua;
+	struct dlm_user_proc *proc;
+	int remove_ownqueue = 0;
+	int queued_types;
+
+	/* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
+	   lkb before dealing with it.  We need to check this
+	   flag before taking ls_clear_proc_locks mutex because if
+	   it's set, dlm_clear_proc_locks() holds the mutex. */
+
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
+		/* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
+		return;
+	}
+
+	ls = lkb->lkb_resource->res_ls;
+	mutex_lock(&ls->ls_clear_proc_locks);
+
+	/* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
+	   can't be delivered.  For ORPHAN's, dlm_clear_proc_locks() freed
+	   lkb->ua so we can't try to use it. */
+
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
+		/* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
+		goto out;
+	}
+
+	DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
+	ua = (struct dlm_user_args *)lkb->lkb_astparam;
+	proc = ua->proc;
+
+	if (type == AST_BAST && ua->bastaddr == NULL)
+		goto out;
+
+	spin_lock(&proc->asts_spin);
+
+	/* Always record the new ast type.  device_read() hands back one ast
+	   per read and keeps the lkb on the list until both bits are clear,
+	   so an lkb that is already queued for one type must still pick up
+	   the other; only the list insertion (and its reference) is skipped
+	   when the lkb is already queued. */
+	queued_types = lkb->lkb_ast_type & (AST_COMP | AST_BAST);
+	lkb->lkb_ast_type |= type;
+	if (!queued_types) {
+		kref_get(&lkb->lkb_ref);
+		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
+		wake_up_interruptible(&proc->wait);
+	}
+
+	/* noqueue requests that fail may need to be removed from the
+	   proc's locks list, there should be a better way of detecting
+	   this situation than checking all these things... */
+
+	if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
+	    ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
+		remove_ownqueue = 1;
+
+	/* We want to copy the lvb to userspace when the completion
+	   ast is read if the status is 0, the lock has an lvb and
+	   lvb_ops says we should.  We could probably have set_lvb_lock()
+	   set update_user_lvb instead and not need old_mode */
+
+	if ((lkb->lkb_ast_type & AST_COMP) &&
+	    (lkb->lkb_lksb->sb_status == 0) &&
+	    lkb->lkb_lksb->sb_lvbptr &&
+	    dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
+		ua->update_user_lvb = 1;
+	else
+		ua->update_user_lvb = 0;
+
+	spin_unlock(&proc->asts_spin);
+
+	if (remove_ownqueue) {
+		spin_lock(&ua->proc->locks_spin);
+		list_del_init(&lkb->lkb_ownqueue);
+		spin_unlock(&ua->proc->locks_spin);
+		dlm_put_lkb(lkb);
+	}
+ out:
+	mutex_unlock(&ls->ls_clear_proc_locks);
+}
+
+static int device_user_lock(struct dlm_user_proc *proc,
+ struct dlm_lock_params *params)
+{ /* Handle a DLM_USER_LOCK write: build a dlm_user_args from the user's
+   parameters and issue either a conversion (DLM_LKF_CONVERT set) or a
+   new request.  Returns the new lock id for a successful request, the
+   result of dlm_user_convert() for a conversion, or a negative errno.
+   NOTE(review): ua is not freed here on any path after allocation;
+   presumably dlm_user_request()/dlm_user_convert() take ownership --
+   confirm in lock.c. */
+ struct dlm_ls *ls;
+ struct dlm_user_args *ua;
+ int error = -ENOMEM;
+
+ ls = dlm_find_lockspace_local(proc->lockspace);
+ if (!ls)
+ return -ENOENT;
+
+ if (!params->castaddr || !params->lksb) { /* both are mandatory */
+ error = -EINVAL;
+ goto out;
+ }
+
+ ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+ if (!ua)
+ goto out;
+ ua->proc = proc;
+ ua->user_lksb = params->lksb;
+ ua->castparam = params->castparam;
+ ua->castaddr = params->castaddr;
+ ua->bastparam = params->bastparam;
+ ua->bastaddr = params->bastaddr;
+
+ if (params->flags & DLM_LKF_CONVERT)
+ error = dlm_user_convert(ls, ua,
+ params->mode, params->flags,
+ params->lkid, params->lvb);
+ else {
+ error = dlm_user_request(ls, ua,
+ params->mode, params->flags,
+ params->name, params->namelen,
+ params->parent);
+ if (!error)
+ error = ua->lksb.sb_lkid; /* success reports the lock id */
+ }
+ out:
+ dlm_put_lockspace(ls); /* drops the reference taken by the find above */
+ return error;
+}
+
+static int device_user_unlock(struct dlm_user_proc *proc,
+ struct dlm_lock_params *params)
+{ /* Handle a DLM_USER_UNLOCK write: cancel the lock if DLM_LKF_CANCEL is
+   set, otherwise unlock it.  Returns the result of dlm_user_cancel()
+   or dlm_user_unlock(), -ENOENT if the lockspace is gone, or -ENOMEM.
+   NOTE(review): unlike device_user_lock() no bast fields are copied
+   and castaddr/lksb are not checked for NULL; ua is not freed here --
+   presumably the callee owns it, confirm in lock.c. */
+ struct dlm_ls *ls;
+ struct dlm_user_args *ua;
+ int error = -ENOMEM;
+
+ ls = dlm_find_lockspace_local(proc->lockspace);
+ if (!ls)
+ return -ENOENT;
+
+ ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+ if (!ua)
+ goto out;
+ ua->proc = proc;
+ ua->user_lksb = params->lksb;
+ ua->castparam = params->castparam;
+ ua->castaddr = params->castaddr;
+
+ if (params->flags & DLM_LKF_CANCEL)
+ error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
+ else
+ error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
+ params->lvb);
+ out:
+ dlm_put_lockspace(ls); /* drops the reference taken by the find above */
+ return error;
+}
+
+static int device_create_lockspace(struct dlm_lspace_params *params)
+{ /* Create a lockspace named params->name and register a misc device
+   "<name_prefix>_<name>" for it.  Requires CAP_SYS_ADMIN.  Returns the
+   minor number assigned to the new device, or a negative errno.
+   NOTE(review): params->name is passed to strlen() as-is; this
+   function does not itself ensure it is NUL-terminated. */
+ dlm_lockspace_t *lockspace;
+ struct dlm_ls *ls;
+ int error, len;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ error = dlm_new_lockspace(params->name, strlen(params->name),
+ &lockspace, 0, DLM_USER_LVB_LEN);
+ if (error)
+ return error;
+
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -ENOENT; /* NOTE(review): the lockspace just created is not released on this path */
+
+ error = -ENOMEM;
+ len = strlen(params->name) + strlen(name_prefix) + 2; /* +2: the '_' separator and the trailing NUL */
+ ls->ls_device.name = kzalloc(len, GFP_KERNEL);
+ if (!ls->ls_device.name)
+ goto fail;
+ snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
+ params->name);
+ ls->ls_device.fops = &device_fops;
+ ls->ls_device.minor = MISC_DYNAMIC_MINOR;
+
+ error = misc_register(&ls->ls_device);
+ if (error) {
+ kfree(ls->ls_device.name);
+ goto fail;
+ }
+
+ error = ls->ls_device.minor; /* the minor is the success return value */
+ dlm_put_lockspace(ls);
+ return error;
+
+ fail:
+ dlm_put_lockspace(ls);
+ dlm_release_lockspace(lockspace, 0); /* undo dlm_new_lockspace() */
+ return error;
+}
+
+static int device_remove_lockspace(struct dlm_lspace_params *params)
+{ /* Deregister the misc device with minor params->minor and release its
+   lockspace.  Requires CAP_SYS_ADMIN.  DLM_USER_LSFLG_FORCEFREE in
+   params->flags selects force level 2 for dlm_release_lockspace().
+   Returns 0 or a negative errno. */
+ dlm_lockspace_t *lockspace;
+ struct dlm_ls *ls;
+ int error, force = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ls = dlm_find_lockspace_device(params->minor);
+ if (!ls)
+ return -ENOENT;
+
+ error = misc_deregister(&ls->ls_device);
+ if (error) {
+ dlm_put_lockspace(ls);
+ goto out;
+ }
+ kfree(ls->ls_device.name); /* allocated in device_create_lockspace() */
+
+ if (params->flags & DLM_USER_LSFLG_FORCEFREE)
+ force = 2;
+
+ lockspace = ls->ls_local_handle;
+
+ /* dlm_release_lockspace waits for references to go to zero,
+ so all processes will need to close their device for the ls
+ before the release will proceed */
+
+ dlm_put_lockspace(ls);
+ error = dlm_release_lockspace(lockspace, force);
+ out:
+ return error;
+}
+
+/* Verify that the interface version in a write request is one we can
+   serve: the major number must equal ours and the minor must not be
+   newer than ours.  Returns 0 when compatible; otherwise logs the two
+   versions and returns -EINVAL. */
+static int check_version(struct dlm_write_request *req)
+{
+	int compatible = req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
+			 req->version[1] <= DLM_DEVICE_VERSION_MINOR;
+
+	if (compatible)
+		return 0;
+
+	printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
+	       "user (%d.%d.%d) kernel (%d.%d.%d)\n",
+	       current->comm, current->pid,
+	       req->version[0], req->version[1], req->version[2],
+	       DLM_DEVICE_VERSION_MAJOR, DLM_DEVICE_VERSION_MINOR,
+	       DLM_DEVICE_VERSION_PATCH);
+	return -EINVAL;
+}
+
+/*
+ * device_write
+ *
+ * device_user_lock
+ * dlm_user_request -> request_lock
+ * dlm_user_convert -> convert_lock
+ *
+ * device_user_unlock
+ * dlm_user_unlock -> unlock_lock
+ * dlm_user_cancel -> cancel_lock
+ *
+ * device_create_lockspace
+ * dlm_new_lockspace
+ *
+ * device_remove_lockspace
+ * dlm_release_lockspace
+ */
+
+/*
+ * device_write - handle one request written to a dlm device
+ *
+ * A write to a lockspace device (file->private_data is the opener's proc)
+ * is a lock or unlock request; a write to the control device
+ * (private_data is NULL) creates or removes a lockspace.
+ *
+ * Returns whatever the command handler returns (for example the lock id of
+ * a new request, or the minor of a newly created lockspace device) or a
+ * negative errno.  All signals are blocked while the command runs.
+ *
+ * NOTE(review): only the fixed-size part of the request is checked against
+ * count; the lock and lockspace name lengths inside it are taken on trust.
+ */
+static ssize_t device_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct dlm_user_proc *proc = file->private_data;
+	struct dlm_write_request *kbuf;
+	sigset_t tmpsig, allsigs;
+	int error;
+
+#ifdef CONFIG_COMPAT
+	if (count < sizeof(struct dlm_write_request32))
+#else
+	if (count < sizeof(struct dlm_write_request))
+#endif
+		return -EINVAL;
+
+	kbuf = kmalloc(count, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (copy_from_user(kbuf, buf, count)) {
+		error = -EFAULT;
+		goto out_free;
+	}
+
+	if (check_version(kbuf)) {
+		error = -EBADE;
+		goto out_free;
+	}
+
+#ifdef CONFIG_COMPAT
+	if (!kbuf->is64bit) {
+		struct dlm_write_request32 *k32buf;
+		k32buf = (struct dlm_write_request32 *)kbuf;
+		kbuf = kmalloc(count + (sizeof(struct dlm_write_request) -
+			       sizeof(struct dlm_write_request32)), GFP_KERNEL);
+		if (!kbuf) {
+			/* kbuf no longer points at the user's copy, so it
+			   has to be freed through k32buf */
+			kfree(k32buf);
+			return -ENOMEM;
+		}
+
+		if (proc)
+			set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
+		compat_input(kbuf, k32buf);
+		kfree(k32buf);
+	}
+#endif
+
+	/* do we really need this? can a write happen after a close?
+	   proc is NULL on the control device; lock commands sent there are
+	   rejected in the switch below, so only test the flag when a proc
+	   exists */
+	if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
+	    proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) {
+		error = -EINVAL;
+		goto out_free;
+	}
+
+	sigfillset(&allsigs);
+	sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
+
+	error = -EINVAL;
+
+	switch (kbuf->cmd)
+	{
+	case DLM_USER_LOCK:
+		if (!proc) {
+			log_print("no locking on control device");
+			goto out_sig;
+		}
+		error = device_user_lock(proc, &kbuf->i.lock);
+		break;
+
+	case DLM_USER_UNLOCK:
+		if (!proc) {
+			log_print("no locking on control device");
+			goto out_sig;
+		}
+		error = device_user_unlock(proc, &kbuf->i.lock);
+		break;
+
+	case DLM_USER_CREATE_LOCKSPACE:
+		if (proc) {
+			log_print("create/remove only on control device");
+			goto out_sig;
+		}
+		error = device_create_lockspace(&kbuf->i.lspace);
+		break;
+
+	case DLM_USER_REMOVE_LOCKSPACE:
+		if (proc) {
+			log_print("create/remove only on control device");
+			goto out_sig;
+		}
+		error = device_remove_lockspace(&kbuf->i.lspace);
+		break;
+
+	default:
+		log_print("Unknown command passed to DLM device : %d\n",
+			  kbuf->cmd);
+	}
+
+ out_sig:
+	sigprocmask(SIG_SETMASK, &tmpsig, NULL);
+	recalc_sigpending();
+ out_free:
+	kfree(kbuf);
+	return error;
+}
+
+/* Every process that opens the lockspace device has its own "proc" structure
+ hanging off the open file that's used to keep track of locks owned by the
+ process and asts that need to be delivered to the process.
+
+ The lockspace is looked up by the device minor.  On success the
+ reference obtained by that lookup is kept for the lifetime of the open
+ file; device_close() drops it.  Returns 0, -ENOENT if no lockspace owns
+ this minor, or -ENOMEM. */
+
+static int device_open(struct inode *inode, struct file *file)
+{
+ struct dlm_user_proc *proc;
+ struct dlm_ls *ls;
+
+ ls = dlm_find_lockspace_device(iminor(inode));
+ if (!ls)
+ return -ENOENT;
+
+ proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
+ if (!proc) {
+ dlm_put_lockspace(ls);
+ return -ENOMEM;
+ }
+
+ proc->lockspace = ls->ls_local_handle;
+ INIT_LIST_HEAD(&proc->asts);
+ INIT_LIST_HEAD(&proc->locks);
+ spin_lock_init(&proc->asts_spin);
+ spin_lock_init(&proc->locks_spin);
+ init_waitqueue_head(&proc->wait);
+ file->private_data = proc;
+
+ return 0; /* ls reference intentionally still held */
+}
+
+static int device_close(struct inode *inode, struct file *file)
+{ /* Release handler for a lockspace device: flag the proc as closing,
+   let dlm_clear_proc_locks() deal with its locks, free the proc and
+   drop both lockspace references (the one taken here and the one
+   kept since device_open()).  All signals are blocked meanwhile.
+   NOTE(review): if the lockspace lookup fails, proc is not freed. */
+ struct dlm_user_proc *proc = file->private_data;
+ struct dlm_ls *ls;
+ sigset_t tmpsig, allsigs;
+
+ ls = dlm_find_lockspace_local(proc->lockspace);
+ if (!ls)
+ return -ENOENT;
+
+ sigfillset(&allsigs);
+ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
+
+ set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
+
+ dlm_clear_proc_locks(ls, proc);
+
+ /* at this point no more lkb's should exist for this lockspace,
+ so there's no chance of dlm_user_add_ast() being called and
+ looking for lkb->ua->proc */
+
+ kfree(proc);
+ file->private_data = NULL;
+
+ dlm_put_lockspace(ls); /* for the find at the top of this function */
+ dlm_put_lockspace(ls); /* for the find in device_open() */
+
+ /* FIXME: AUTOFREE: if this ls is no longer used do
+ device_remove_lockspace() */
+
+ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
+ recalc_sigpending();
+
+ return 0;
+}
+
+static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
+ int bmode, char __user *buf, size_t count)
+{ /* Build a dlm_lock_result describing one ast and copy it to buf.
+   ua:     user args of the lock the ast is for
+   compat: non-zero to emit the dlm_lock_result32 layout
+   type:   AST_BAST selects the blocking ast address/param and sets
+           bast_mode; anything else selects the completion ast
+   bmode:  blocking mode reported for AST_BAST
+   count:  size of the user buffer
+   The lvb is appended after the struct when ua->update_user_lvb is
+   set, an lvb exists and buf has room.  Returns the number of bytes
+   made available (struct plus optional lvb) or -EFAULT. */
+#ifdef CONFIG_COMPAT
+ struct dlm_lock_result32 result32;
+#endif
+ struct dlm_lock_result result;
+ void *resultptr;
+ int error=0;
+ int len;
+ int struct_len;
+
+ memset(&result, 0, sizeof(struct dlm_lock_result));
+ memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
+ result.user_lksb = ua->user_lksb;
+
+ /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
+ in a conversion unless the conversion is successful. See code
+ in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though,
+ notes that a new blocking AST address and parameter are set even if
+ the conversion fails, so maybe we should just do that. */
+
+ if (type == AST_BAST) {
+ result.user_astaddr = ua->bastaddr;
+ result.user_astparam = ua->bastparam;
+ result.bast_mode = bmode;
+ } else {
+ result.user_astaddr = ua->castaddr;
+ result.user_astparam = ua->castparam;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (compat)
+ len = sizeof(struct dlm_lock_result32);
+ else
+#endif
+ len = sizeof(struct dlm_lock_result);
+ struct_len = len; /* size of the fixed part, before any lvb is appended */
+
+ /* copy lvb to userspace if there is one, it's been updated, and
+ the user buffer has space for it */
+
+ if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
+ count >= len + DLM_USER_LVB_LEN) {
+ if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
+ DLM_USER_LVB_LEN)) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ result.lvb_offset = len; /* lvb sits directly after the struct */
+ len += DLM_USER_LVB_LEN;
+ }
+
+ result.length = len;
+ resultptr = &result;
+#ifdef CONFIG_COMPAT
+ if (compat) {
+ compat_output(&result, &result32);
+ resultptr = &result32;
+ }
+#endif
+
+ if (copy_to_user(buf, resultptr, struct_len))
+ error = -EFAULT;
+ else
+ error = len;
+ out:
+ return error;
+}
+
+/* a read returns a single ast described in a struct dlm_lock_result
+
+ If no ast is queued, a non-blocking file gets -EAGAIN and a blocking
+ one sleeps interruptibly until dlm_user_add_ast() queues something or a
+ signal arrives (-ERESTARTSYS).  A completion ast is returned before a
+ blocking ast for the same lkb.  Returns the value of
+ copy_result_to_user(), or -EINVAL if count is too small or the proc is
+ closing. */
+
+static ssize_t device_read(struct file *file, char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct dlm_user_proc *proc = file->private_data;
+ struct dlm_lkb *lkb;
+ struct dlm_user_args *ua;
+ DECLARE_WAITQUEUE(wait, current);
+ int error, type=0, bmode=0, removed = 0;
+
+#ifdef CONFIG_COMPAT
+ if (count < sizeof(struct dlm_lock_result32))
+#else
+ if (count < sizeof(struct dlm_lock_result))
+#endif
+ return -EINVAL;
+
+ /* do we really need this? can a read happen after a close? */
+ if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
+ return -EINVAL;
+
+ spin_lock(&proc->asts_spin);
+ if (list_empty(&proc->asts)) {
+ if (file->f_flags & O_NONBLOCK) {
+ spin_unlock(&proc->asts_spin);
+ return -EAGAIN;
+ }
+
+ add_wait_queue(&proc->wait, &wait);
+
+ repeat:
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (list_empty(&proc->asts) && !signal_pending(current)) {
+ spin_unlock(&proc->asts_spin); /* asts_spin is not held while sleeping */
+ schedule();
+ spin_lock(&proc->asts_spin);
+ goto repeat;
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&proc->wait, &wait);
+
+ if (signal_pending(current)) {
+ spin_unlock(&proc->asts_spin);
+ return -ERESTARTSYS;
+ }
+ }
+
+ if (list_empty(&proc->asts)) {
+ spin_unlock(&proc->asts_spin);
+ return -EAGAIN;
+ }
+
+ /* there may be both completion and blocking asts to return for
+ the lkb, don't remove lkb from asts list unless no asts remain */
+
+ lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
+
+ if (lkb->lkb_ast_type & AST_COMP) {
+ lkb->lkb_ast_type &= ~AST_COMP;
+ type = AST_COMP;
+ } else if (lkb->lkb_ast_type & AST_BAST) {
+ lkb->lkb_ast_type &= ~AST_BAST;
+ type = AST_BAST;
+ bmode = lkb->lkb_bastmode;
+ }
+
+ if (!lkb->lkb_ast_type) {
+ list_del(&lkb->lkb_astqueue);
+ removed = 1;
+ }
+ spin_unlock(&proc->asts_spin);
+
+ ua = (struct dlm_user_args *)lkb->lkb_astparam;
+ error = copy_result_to_user(ua,
+ test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+ type, bmode, buf, count);
+
+ /* removes reference for the proc->asts lists added by
+ dlm_user_add_ast() and may result in the lkb being freed */
+ if (removed)
+ dlm_put_lkb(lkb);
+
+ return error;
+}
+
+/* Poll handler for a lockspace device: the file is readable whenever at
+   least one ast is queued for this proc. */
+static unsigned int device_poll(struct file *file, poll_table *wait)
+{
+	struct dlm_user_proc *proc = file->private_data;
+	unsigned int mask = 0;
+
+	poll_wait(file, &proc->wait, wait);
+
+	spin_lock(&proc->asts_spin);
+	if (!list_empty(&proc->asts))
+		mask = POLLIN | POLLRDNORM;
+	spin_unlock(&proc->asts_spin);
+
+	return mask;
+}
+
+static int ctl_device_open(struct inode *inode, struct file *file)
+{ /* the control device has no per-open proc; device_write() treats a
+   NULL private_data as "this is the control device" */
+ file->private_data = NULL;
+ return 0;
+}
+
+static int ctl_device_close(struct inode *inode, struct file *file)
+{ /* nothing was set up by ctl_device_open(), so nothing to undo */
+ return 0;
+}
+
+static struct file_operations device_fops = {
+ .open = device_open,
+ .release = device_close,
+ .read = device_read,
+ .write = device_write,
+ .poll = device_poll,
+ .owner = THIS_MODULE,
+};
+
+static struct file_operations ctl_device_fops = {
+ .open = ctl_device_open,
+ .release = ctl_device_close,
+ .write = device_write,
+ .owner = THIS_MODULE,
+};
+
+/* Register the "dlm-control" misc device, the one used to create and
+   remove lockspaces.  Returns the result of misc_register(). */
+int dlm_user_init(void)
+{
+	int rv;
+
+	ctl_device.minor = MISC_DYNAMIC_MINOR;
+	ctl_device.name = "dlm-control";
+	ctl_device.fops = &ctl_device_fops;
+
+	rv = misc_register(&ctl_device);
+	if (rv)
+		log_print("misc_register failed for control device");
+	return rv;
+}
+
+void dlm_user_exit(void)
+{ /* undo dlm_user_init(): remove the control device */
+ misc_deregister(&ctl_device);
+}
+
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 000000000000..d38e9f3e4151
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __USER_DOT_H__
+#define __USER_DOT_H__
+
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
+int dlm_user_init(void);
+void dlm_user_exit(void);
+
+#endif
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "rcom.h"
+#include "util.h"
+
+static void header_out(struct dlm_header *hd)
+{ /* convert the multi-byte header fields in place from cpu byte order
+   to little-endian */
+ hd->h_version = cpu_to_le32(hd->h_version);
+ hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
+ hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
+ hd->h_length = cpu_to_le16(hd->h_length);
+}
+
+static void header_in(struct dlm_header *hd)
+{ /* inverse of header_out(): convert the header fields in place from
+   little-endian to cpu byte order */
+ hd->h_version = le32_to_cpu(hd->h_version);
+ hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
+ hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
+ hd->h_length = le16_to_cpu(hd->h_length);
+}
+
+void dlm_message_out(struct dlm_message *ms)
+{ /* convert a dlm_message in place from cpu byte order to
+   little-endian: first the dlm_header at its start, then each of the
+   32-bit message fields listed below */
+ struct dlm_header *hd = (struct dlm_header *) ms;
+
+ header_out(hd);
+
+ ms->m_type = cpu_to_le32(ms->m_type);
+ ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
+ ms->m_pid = cpu_to_le32(ms->m_pid);
+ ms->m_lkid = cpu_to_le32(ms->m_lkid);
+ ms->m_remid = cpu_to_le32(ms->m_remid);
+ ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
+ ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
+ ms->m_exflags = cpu_to_le32(ms->m_exflags);
+ ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
+ ms->m_flags = cpu_to_le32(ms->m_flags);
+ ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
+ ms->m_hash = cpu_to_le32(ms->m_hash);
+ ms->m_status = cpu_to_le32(ms->m_status);
+ ms->m_grmode = cpu_to_le32(ms->m_grmode);
+ ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
+ ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
+ ms->m_asts = cpu_to_le32(ms->m_asts);
+ ms->m_result = cpu_to_le32(ms->m_result);
+}
+
+void dlm_message_in(struct dlm_message *ms)
+{ /* inverse of dlm_message_out(): convert the header and the same set
+   of 32-bit fields in place from little-endian to cpu byte order */
+ struct dlm_header *hd = (struct dlm_header *) ms;
+
+ header_in(hd);
+
+ ms->m_type = le32_to_cpu(ms->m_type);
+ ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
+ ms->m_pid = le32_to_cpu(ms->m_pid);
+ ms->m_lkid = le32_to_cpu(ms->m_lkid);
+ ms->m_remid = le32_to_cpu(ms->m_remid);
+ ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
+ ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
+ ms->m_exflags = le32_to_cpu(ms->m_exflags);
+ ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
+ ms->m_flags = le32_to_cpu(ms->m_flags);
+ ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
+ ms->m_hash = le32_to_cpu(ms->m_hash);
+ ms->m_status = le32_to_cpu(ms->m_status);
+ ms->m_grmode = le32_to_cpu(ms->m_grmode);
+ ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
+ ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
+ ms->m_asts = le32_to_cpu(ms->m_asts);
+ ms->m_result = le32_to_cpu(ms->m_result);
+}
+
+static void rcom_lock_out(struct rcom_lock *rl)
+{ /* convert the rcom_lock fields in place from cpu byte order to
+   little-endian; rl_wait_type and rl_namelen are 16-bit, the rest
+   32-bit */
+ rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
+ rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
+ rl->rl_remid = cpu_to_le32(rl->rl_remid);
+ rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
+ rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
+ rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
+ rl->rl_flags = cpu_to_le32(rl->rl_flags);
+ rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
+ rl->rl_result = cpu_to_le32(rl->rl_result);
+ rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
+ rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
+}
+
+static void rcom_lock_in(struct rcom_lock *rl)
+{ /* inverse of rcom_lock_out(): convert the same fields in place from
+   little-endian to cpu byte order */
+ rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
+ rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
+ rl->rl_remid = le32_to_cpu(rl->rl_remid);
+ rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
+ rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
+ rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
+ rl->rl_flags = le32_to_cpu(rl->rl_flags);
+ rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
+ rl->rl_result = le32_to_cpu(rl->rl_result);
+ rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
+ rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
+}
+
+static void rcom_config_out(struct rcom_config *rf)
+{ /* convert both rcom_config fields in place from cpu byte order to
+   little-endian */
+ rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
+ rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
+}
+
+static void rcom_config_in(struct rcom_config *rf)
+{ /* inverse of rcom_config_out(): little-endian to cpu byte order, in
+   place */
+ rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
+ rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
+}
+
+/* Convert a dlm_rcom in place from cpu byte order to little-endian: the
+   header, the fixed rcom fields, and (for the two message types that
+   carry one) the structure stored in rc_buf. */
+void dlm_rcom_out(struct dlm_rcom *rc)
+{
+	/* remember the type in cpu order; rc_type itself is about to be
+	   converted and can't be compared afterwards */
+	int cpu_type = rc->rc_type;
+
+	header_out((struct dlm_header *) rc);
+
+	rc->rc_type = cpu_to_le32(rc->rc_type);
+	rc->rc_result = cpu_to_le32(rc->rc_result);
+	rc->rc_id = cpu_to_le64(rc->rc_id);
+
+	switch (cpu_type) {
+	case DLM_RCOM_LOCK:
+		rcom_lock_out((struct rcom_lock *) rc->rc_buf);
+		break;
+	case DLM_RCOM_STATUS_REPLY:
+		rcom_config_out((struct rcom_config *) rc->rc_buf);
+		break;
+	default:
+		break;
+	}
+}
+
+/* Convert a dlm_rcom in place from little-endian to cpu byte order: the
+   header, the fixed rcom fields, and (for the two message types that
+   carry one) the structure stored in rc_buf. */
+void dlm_rcom_in(struct dlm_rcom *rc)
+{
+	header_in((struct dlm_header *) rc);
+
+	rc->rc_type = le32_to_cpu(rc->rc_type);
+	rc->rc_result = le32_to_cpu(rc->rc_result);
+	rc->rc_id = le64_to_cpu(rc->rc_id);
+
+	/* rc_type is in cpu order by now, so it can be tested directly */
+	switch (rc->rc_type) {
+	case DLM_RCOM_LOCK:
+		rcom_lock_in((struct rcom_lock *) rc->rc_buf);
+		break;
+	case DLM_RCOM_STATUS_REPLY:
+		rcom_config_in((struct rcom_config *) rc->rc_buf);
+		break;
+	default:
+		break;
+	}
+}
+
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+
+void dlm_message_out(struct dlm_message *ms);
+void dlm_message_in(struct dlm_message *ms);
+void dlm_rcom_out(struct dlm_rcom *rc);
+void dlm_rcom_in(struct dlm_rcom *rc);
+
+#endif
+
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..8c27de8b9568
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,44 @@
+config GFS2_FS
+ tristate "GFS2 file system support"
+ depends on EXPERIMENTAL
+ select FS_POSIX_ACL
+ help
+ A cluster filesystem.
+
+ Allows a cluster of computers to simultaneously use a block device
+ that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads
+ and writes to the block device like a local filesystem, but also uses
+ a lock module to allow the computers to coordinate their I/O so
+ filesystem consistency is maintained. One of the nifty features of
+ GFS is perfect consistency -- changes made to the filesystem on one
+ machine show up immediately on all other machines in the cluster.
+
+ To use the GFS2 filesystem, you will need to enable one or more of
+ the below locking modules. Documentation and utilities for GFS2 can
+ be found here: http://sources.redhat.com/cluster
+
+config GFS2_FS_LOCKING_NOLOCK
+ tristate "GFS2 \"nolock\" locking module"
+ depends on GFS2_FS
+ help
+ Single node locking module for GFS2.
+
+ Use this module if you want to use GFS2 on a single node without
+ its clustering features. You can still take advantage of the
+ large file support, and upgrade to running a full cluster later on
+ if required.
+
+ If you will only be using GFS2 in cluster mode, you do not need this
+ module.
+
+config GFS2_FS_LOCKING_DLM
+ tristate "GFS2 DLM locking module"
+ depends on GFS2_FS
+ select DLM
+ help
+ Multiple node locking module for GFS2
+
+ Most users of GFS2 will require this module. It provides the locking
+ interface between GFS2 and the DLM, which is required to use GFS2
+ in a cluster environment.
+
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..e3f1ada643ac
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
+obj-$(CONFIG_GFS2_FS) += gfs2.o
+gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
+ glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
+ mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
+ ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
+ recovery.o rgrp.o super.o sys.o trans.o util.o
+
+obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
+obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
+
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..5f959b8ce406
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "trans.h"
+#include "util.h"
+
+/*
+ * Values for the "access" argument of acl_get(): non-zero selects the
+ * access ACL attribute name, zero selects the default ACL attribute name.
+ */
+#define ACL_ACCESS 1
+#define ACL_DEFAULT 0
+
+/*
+ * gfs2_acl_validate_set - check a request to set an ACL extended attribute
+ * @ip: the inode the ACL is being set on
+ * @access: non-zero for the access ACL, zero for the default ACL
+ * @er: the request carrying the xattr-encoded ACL (er_data / er_data_len)
+ * @remove: set to 1 when the request amounts to removing the ACL
+ * @mode: passed to posix_acl_equiv_mode() when @access is non-zero
+ *
+ * *remove is set to 1 when the data parses to a NULL ACL, or when
+ * posix_acl_equiv_mode() returns 0 for an access ACL.
+ *
+ * Returns: 0 on success, or a negative errno
+ */
+int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
+			  struct gfs2_ea_request *er,
+			  int *remove, mode_t *mode)
+{
+	struct posix_acl *acl;
+	int error;
+
+	/* Setting is subject to the same permission checks as removal. */
+	error = gfs2_acl_validate_remove(ip, access);
+	if (error)
+		return error;
+
+	if (!er->er_data)
+		return -EINVAL;
+
+	acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (!acl) {
+		*remove = 1;
+		return 0;
+	}
+
+	error = posix_acl_valid(acl);
+	if (!error && access) {
+		error = posix_acl_equiv_mode(acl, mode);
+		if (error == 0)
+			*remove = 1;
+		else if (error > 0)
+			error = 0;	/* positive result is not a failure */
+	}
+
+	posix_acl_release(acl);
+	return error;
+}
+
+/*
+ * gfs2_acl_validate_remove - check whether an ACL may be changed or removed
+ * @ip: the inode carrying the ACL
+ * @access: non-zero for the access ACL, zero for the default ACL
+ *
+ * The checks are made in a fixed order, so the first failing one decides
+ * which errno is reported.
+ *
+ * Returns: 0 if allowed, -EOPNOTSUPP, -EPERM or -EACCES otherwise
+ */
+int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+	/* ACL support is switched on by the ar_posix_acl argument. */
+	if (!sdp->sd_args.ar_posix_acl)
+		return -EOPNOTSUPP;
+
+	/* Only the owner, or a CAP_FOWNER holder, may proceed. */
+	if (current->fsuid != ip->i_di.di_uid) {
+		if (!capable(CAP_FOWNER))
+			return -EPERM;
+	}
+
+	if (S_ISLNK(ip->i_di.di_mode))
+		return -EOPNOTSUPP;
+
+	/* A default ACL is only accepted on a directory. */
+	if (!access) {
+		if (!S_ISDIR(ip->i_di.di_mode))
+			return -EACCES;
+	}
+
+	return 0;
+}
+
+/*
+ * acl_get - look up one of an inode's POSIX ACL extended attributes
+ * @ip: the inode to look at
+ * @access: non-zero selects the access ACL name, zero the default ACL name
+ * @acl: if non-NULL, receives the result of posix_acl_from_xattr()
+ * @el: if non-NULL, receives the EA location; if NULL a local one is used
+ * @data: if non-NULL, receives a kmalloc()ed copy of the raw EA data
+ * @len: receives the length of *data; written whenever *data is written
+ *
+ * Returns 0 without setting *acl, *data or *len when the inode has no
+ * di_eattr, when the EA is not found, or when the EA has zero data length.
+ *
+ * When a caller-supplied @el is used and 0 is returned after a successful
+ * lookup, el->el_bh is NOT released here; the caller must brelse() it.
+ * Likewise *data, once handed out, must be kfree()d by the caller.
+ *
+ * Returns: 0 on success, or a negative errno
+ */
+static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
+ struct gfs2_ea_location *el, char **data, unsigned int *len)
+{
+ struct gfs2_ea_request er;
+ struct gfs2_ea_location el_this;
+ int error;
+
+ if (!ip->i_di.di_eattr)
+ return 0;
+
+ memset(&er, 0, sizeof(struct gfs2_ea_request));
+ if (access) {
+ er.er_name = GFS2_POSIX_ACL_ACCESS;
+ er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
+ } else {
+ er.er_name = GFS2_POSIX_ACL_DEFAULT;
+ er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
+ }
+ er.er_type = GFS2_EATYPE_SYS;
+
+ /* Fall back to a local location when the caller does not want one */
+ if (!el)
+ el = &el_this;
+
+ error = gfs2_ea_find(ip, &er, el);
+ if (error)
+ return error;
+ if (!el->el_ea)
+ return 0;
+ if (!GFS2_EA_DATA_LEN(el->el_ea))
+ goto out;
+
+ er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
+ er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
+ error = -ENOMEM;
+ if (!er.er_data)
+ goto out;
+
+ error = gfs2_ea_get_copy(ip, el, er.er_data);
+ if (error)
+ goto out_kfree;
+
+ if (acl) {
+ *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
+ if (IS_ERR(*acl))
+ error = PTR_ERR(*acl);
+ }
+
+out_kfree:
+ /* The copy is only handed to the caller on success, and if wanted */
+ if (error || !data)
+ kfree(er.er_data);
+ else {
+ *data = er.er_data;
+ *len = er.er_data_len;
+ }
+out:
+ /* Keep the buffer reference only for a successful caller-supplied el */
+ if (error || el == &el_this)
+ brelse(el->el_bh);
+ return error;
+}
+
+/**
+ * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
+ * @inode: the file we want to do something to
+ * @mask: what we want to do
+ *
+ * Looks up the access ACL of @inode and, if there is one, asks
+ * posix_acl_permission() for the verdict.
+ *
+ * Returns: errno; -EAGAIN when the inode has no access ACL
+ */
+
+int gfs2_check_acl_locked(struct inode *inode, int mask)
+{
+	struct posix_acl *acl = NULL;
+	int error;
+
+	error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
+	if (error)
+		return error;
+
+	/* No ACL present: there is nothing to base a decision on. */
+	if (!acl)
+		return -EAGAIN;
+
+	error = posix_acl_permission(inode, acl, mask);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+/*
+ * gfs2_check_acl - take the inode glock and check the access ACL
+ * @inode: the file we want to do something to
+ * @mask: what we want to do
+ *
+ * Wraps gfs2_check_acl_locked() in a shared (LM_ST_SHARED, LM_FLAG_ANY)
+ * hold on the inode's glock.
+ *
+ * Returns: errno
+ */
+int gfs2_check_acl(struct inode *inode, int mask)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder i_gh;
+	int error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return error;
+
+	error = gfs2_check_acl_locked(inode, mask);
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+/*
+ * munge_mode - write a new mode into an inode's dinode
+ * @ip: the inode to update
+ * @mode: the new mode; its file-type bits must match the current ones
+ *
+ * Runs in its own RES_DINODE transaction.
+ *
+ * Returns: 0 on success, or the errno from starting the transaction or
+ *          from reading the dinode buffer
+ */
+static int munge_mode(struct gfs2_inode *ip, mode_t mode)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		gfs2_assert_withdraw(sdp,
+			(ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
+		ip->i_di.di_mode = mode;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+	/* Report a failed dinode read instead of pretending the mode was set */
+	return error;
+}
+
+/*
+ * gfs2_acl_create - set up the ACLs and mode of a newly created inode
+ * @dip: the parent directory, whose default ACL is consulted
+ * @ip: the new inode
+ *
+ * Does nothing when ACLs are disabled (ar_posix_acl) or @ip is a symlink.
+ * Without a default ACL on @dip the umask is applied to the mode.
+ * Otherwise a directory inherits the default ACL, and the access ACL is
+ * derived from it with posix_acl_create_masq().
+ *
+ * Returns: 0 on success, or a negative errno
+ */
+int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct posix_acl *acl = NULL, *clone;
+	struct gfs2_ea_request er;
+	mode_t mode = ip->i_di.di_mode;
+	int error;
+
+	if (!sdp->sd_args.ar_posix_acl)
+		return 0;
+	if (S_ISLNK(ip->i_di.di_mode))
+		return 0;
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	er.er_type = GFS2_EATYPE_SYS;
+
+	error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
+			&er.er_data, &er.er_data_len);
+	if (error)
+		return error;
+	if (!acl) {
+		mode &= ~current->fs->umask;
+		if (mode != ip->i_di.di_mode)
+			error = munge_mode(ip, mode);
+		/*
+		 * acl_get() may have handed back the raw EA data even though
+		 * it parsed to a NULL ACL, so it still has to be freed.
+		 */
+		goto out_kfree;
+	}
+
+	/* Work on a private copy; the masq call below modifies the ACL */
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	error = -ENOMEM;
+	if (!clone)
+		goto out;
+	posix_acl_release(acl);
+	acl = clone;
+
+	if (S_ISDIR(ip->i_di.di_mode)) {
+		er.er_name = GFS2_POSIX_ACL_DEFAULT;
+		er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
+		error = gfs2_system_eaops.eo_set(ip, &er);
+		if (error)
+			goto out;
+	}
+
+	error = posix_acl_create_masq(acl, &mode);
+	if (error < 0)
+		goto out;
+	if (error > 0) {
+		er.er_name = GFS2_POSIX_ACL_ACCESS;
+		er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
+		posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
+		er.er_mode = mode;
+		er.er_flags = GFS2_ERF_MODE;
+		error = gfs2_system_eaops.eo_set(ip, &er);
+		if (error)
+			goto out;
+	} else
+		/* Only the mode needs updating; report a failure to do so */
+		error = munge_mode(ip, mode);
+
+out:
+	posix_acl_release(acl);
+out_kfree:
+	kfree(er.er_data);
+	return error;
+}
+
+/*
+ * gfs2_acl_chmod - change an inode's mode, keeping its access ACL in step
+ * @ip: the inode
+ * @attr: the new attributes; attr->ia_mode is the mode being applied
+ *
+ * Without an access ACL this is just gfs2_setattr_simple(). Otherwise the
+ * ACL is adjusted with posix_acl_chmod_masq() and written back together
+ * with the attributes by gfs2_ea_acl_chmod().
+ *
+ * Returns: 0 on success, or a negative errno
+ */
+int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
+{
+	struct posix_acl *acl = NULL, *clone;
+	struct gfs2_ea_location el;
+	char *data = NULL;
+	unsigned int len;
+	int error;
+
+	/*
+	 * acl_get() can return 0 without touching el or data, so start both
+	 * in a state that is safe to release.
+	 */
+	memset(&el, 0, sizeof(struct gfs2_ea_location));
+
+	error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
+	if (error)
+		return error;
+	if (!acl) {
+		/*
+		 * The EA may exist without yielding an ACL; acl_get() then
+		 * still leaves el.el_bh (and possibly data) with us, so go
+		 * through the common release path.
+		 */
+		error = gfs2_setattr_simple(ip, attr);
+		goto out;
+	}
+
+	/* Work on a private copy; the masq call below modifies the ACL */
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	error = -ENOMEM;
+	if (!clone)
+		goto out;
+	posix_acl_release(acl);
+	acl = clone;
+
+	error = posix_acl_chmod_masq(acl, attr->ia_mode);
+	if (!error) {
+		posix_acl_to_xattr(acl, data, len);
+		error = gfs2_ea_acl_chmod(ip, &el, attr, data);
+	}
+
+out:
+	posix_acl_release(acl);
+	brelse(el.el_bh);
+	kfree(data);
+	return error;
+}
+
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..05c294fe0d78
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __ACL_DOT_H__
+#define __ACL_DOT_H__
+
+#include "incore.h"
+
+/* Extended attribute names of the two POSIX ACLs, with their lengths */
+#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
+#define GFS2_POSIX_ACL_ACCESS_LEN 16
+#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
+#define GFS2_POSIX_ACL_DEFAULT_LEN 17
+
+/* True when (name, len) is exactly the access ACL attribute name */
+#define GFS2_ACL_IS_ACCESS(name, len) \
+ ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
+ !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
+
+/* True when (name, len) is exactly the default ACL attribute name */
+#define GFS2_ACL_IS_DEFAULT(name, len) \
+ ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
+ !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
+
+/* Only used through a pointer here, so a forward declaration suffices */
+struct gfs2_ea_request;
+
+int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
+ struct gfs2_ea_request *er,
+ int *remove, mode_t *mode);
+int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
+int gfs2_check_acl_locked(struct inode *inode, int mask);
+int gfs2_check_acl(struct inode *inode, int mask);
+int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
+int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+
+#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..cc57f2ecd219
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1221 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "dir.h"
+#include "util.h"
+#include "ops_address.h"
+
+/* This doesn't need to be that large as max 64 bit pointers in a 4k
+ * block is 512, so __u16 is fine for that. It saves stack space to
+ * keep it small.
+ *
+ * mp_list[h] is the index of the pointer to follow at height h.
+ */
+struct metapath {
+ __u16 mp_list[GFS2_MAX_META_HEIGHT];
+};
+
+/*
+ * Callback invoked by recursive_scan() for each buffer of pointers it
+ * visits; @top and @bottom delimit the pointers to look at.
+ */
+typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
+ struct buffer_head *bh, u64 *top,
+ u64 *bottom, unsigned int height,
+ void *data);
+
+/*
+ * State handed to do_strip(): sm_height is the tree height being stripped,
+ * sm_first is non-zero while the first pointer at that height must be kept.
+ */
+struct strip_mine {
+ int sm_first;
+ unsigned int sm_height;
+};
+
+/**
+ * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @block: the block number that was allocated
+ * @page: any locked page held by the caller process
+ *
+ * Returns: errno
+ */
+
+static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
+ u64 block, struct page *page)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct inode *inode = &ip->i_inode;
+ struct buffer_head *bh;
+ int release = 0;
+
+ /* The caller's page is only usable if it is page 0 of the file;
+ otherwise grab page 0 ourselves and release it before returning */
+ if (!page || page->index) {
+ page = grab_cache_page(inode->i_mapping, 0);
+ if (!page)
+ return -ENOMEM;
+ release = 1;
+ }
+
+ /* Copy the stuffed data out of the dinode and zero the rest of the page */
+ if (!PageUptodate(page)) {
+ void *kaddr = kmap(page);
+
+ memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
+ ip->i_di.di_size);
+ memset(kaddr + ip->i_di.di_size, 0,
+ PAGE_CACHE_SIZE - ip->i_di.di_size);
+ kunmap(page);
+
+ SetPageUptodate(page);
+ }
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << inode->i_blkbits,
+ (1 << BH_Uptodate));
+
+ bh = page_buffers(page);
+
+ /* Point the page's first buffer at the newly allocated block */
+ if (!buffer_mapped(bh))
+ map_bh(bh, inode->i_sb, block);
+
+ set_buffer_uptodate(bh);
+ if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+ gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ mark_buffer_dirty(bh);
+
+ if (release) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
+ * @ip: The GFS2 inode to unstuff
+ * @page: passed on to gfs2_unstuffer_page() for non-directory inodes
+ *
+ * This routine unstuffs a dinode and returns it to a "normal" state such
+ * that the height can be grown in the traditional way.
+ *
+ * Takes ip->i_rw_mutex for writing for the duration of the call.
+ *
+ * Returns: errno
+ */
+
+int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
+{
+ struct buffer_head *bh, *dibh;
+ struct gfs2_dinode *di;
+ u64 block = 0;
+ int isdir = gfs2_is_dir(ip);
+ int error;
+
+ down_write(&ip->i_rw_mutex);
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out;
+
+ if (ip->i_di.di_size) {
+ /* Get a free block, fill it with the stuffed data,
+ and write it out to disk */
+
+ if (isdir) {
+ block = gfs2_alloc_meta(ip);
+
+ error = gfs2_dir_get_new_buffer(ip, block, &bh);
+ if (error)
+ goto out_brelse;
+ gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
+ dibh, sizeof(struct gfs2_dinode));
+ brelse(bh);
+ } else {
+ block = gfs2_alloc_data(ip);
+
+ error = gfs2_unstuffer_page(ip, dibh, block, page);
+ if (error)
+ goto out_brelse;
+ }
+ }
+
+ /* Set up the pointer to the new block */
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ di = (struct gfs2_dinode *)dibh->b_data;
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
+ /* An empty file gets no block; only its height changes below */
+ if (ip->i_di.di_size) {
+ *(__be64 *)(di + 1) = cpu_to_be64(block);
+ ip->i_di.di_blocks++;
+ di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
+ }
+
+ ip->i_di.di_height = 1;
+ di->di_height = cpu_to_be16(1);
+
+out_brelse:
+ brelse(dibh);
+out:
+ up_write(&ip->i_rw_mutex);
+ return error;
+}
+
+/**
+ * calc_tree_height - Calculate the height of a metadata tree
+ * @ip: The GFS2 inode
+ * @size: The proposed size of the file
+ *
+ * Work out how tall a metadata tree needs to be in order to accommodate a
+ * file of a particular size. If size is less than the current size of
+ * the inode, then the current size of the inode is used instead of the
+ * supplied one.
+ *
+ * Directories are measured against the sd_jheightsize table, everything
+ * else against sd_heightsize.
+ *
+ * Returns: the height the tree should be
+ */
+
+static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int is_dir = gfs2_is_dir(ip);
+	u64 *limits = is_dir ? sdp->sd_jheightsize : sdp->sd_heightsize;
+	unsigned int max_height = is_dir ? sdp->sd_max_jheight :
+					   sdp->sd_max_height;
+	unsigned int height = 0;
+
+	/* Never size the tree for less than the file already holds. */
+	if (size < ip->i_di.di_size)
+		size = ip->i_di.di_size;
+
+	/* Stop at the first height whose capacity covers the size. */
+	while (height < max_height && limits[height] < size)
+		height++;
+
+	return height;
+}
+
+/**
+ * build_height - Build a metadata tree of the requested height
+ * @inode: The inode whose tree is to be grown
+ * @height: The height to build to
+ *
+ * Allocates one new indirect block per added level, chains them together
+ * and moves the pointers currently held in the dinode into the lowest new
+ * block. Does nothing if the tree is already at least @height tall.
+ *
+ * Returns: errno
+ */
+
+static int build_height(struct inode *inode, unsigned height)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ unsigned new_height = height - ip->i_di.di_height;
+ struct buffer_head *dibh;
+ struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
+ struct gfs2_dinode *di;
+ int error;
+ u64 *bp;
+ u64 bn;
+ unsigned n;
+
+ if (height <= ip->i_di.di_height)
+ return 0;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ return error;
+
+ /* Allocate every new level up front and add it to the transaction */
+ for(n = 0; n < new_height; n++) {
+ bn = gfs2_alloc_meta(ip);
+ blocks[n] = gfs2_meta_new(ip->i_gl, bn);
+ gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
+ }
+
+ n = 0;
+ /* blocks[0] becomes the block the dinode points at */
+ bn = blocks[0]->b_blocknr;
+ if (new_height > 1) {
+ /* Each upper new block holds a single pointer to the next one down */
+ for(; n < new_height-1; n++) {
+ gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
+ GFS2_FORMAT_IN);
+ gfs2_buffer_clear_tail(blocks[n],
+ sizeof(struct gfs2_meta_header));
+ bp = (u64 *)(blocks[n]->b_data +
+ sizeof(struct gfs2_meta_header));
+ *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
+ brelse(blocks[n]);
+ blocks[n] = NULL;
+ }
+ }
+ /* The lowest new block takes over the pointers from the dinode */
+ gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+ gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
+ dibh, sizeof(struct gfs2_dinode));
+ brelse(blocks[n]);
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ di = (struct gfs2_dinode *)dibh->b_data;
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+ *(__be64 *)(di + 1) = cpu_to_be64(bn);
+ ip->i_di.di_height += new_height;
+ ip->i_di.di_blocks += new_height;
+ di->di_height = cpu_to_be16(ip->i_di.di_height);
+ di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
+ brelse(dibh);
+ return error;
+}
+
+/**
+ * find_metapath - Find path through the metadata tree
+ * @ip: The inode pointer
+ * @mp: The metapath to return the result in
+ * @block: The disk block to look up
+ *
+ * This routine returns a struct metapath structure that defines a path
+ * through the metadata of inode "ip" to get to block "block".
+ *
+ * Example:
+ * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
+ * filesystem with a blocksize of 4096.
+ *
+ * find_metapath() would fill in the struct metapath with:
+ * mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
+ * (struct metapath holds only mp_list; the offset and height are not stored.)
+ *
+ * That means that in order to get to the block containing the byte at
+ * offset 101342453, we would load the indirect block pointed to by pointer
+ * 0 in the dinode. We would then load the indirect block pointed to by
+ * pointer 48 in that indirect block. We would then load the data block
+ * pointed to by pointer 165 in that indirect block.
+ *
+ * ----------------------------------------
+ * | Dinode | |
+ * | | 4|
+ * | |0 1 2 3 4 5 9|
+ * | | 6|
+ * ----------------------------------------
+ * |
+ * |
+ * V
+ * ----------------------------------------
+ * | Indirect Block |
+ * | 5|
+ * | 4 4 4 4 4 5 5 1|
+ * |0 5 6 7 8 9 0 1 2|
+ * ----------------------------------------
+ * |
+ * |
+ * V
+ * ----------------------------------------
+ * | Indirect Block |
+ * | 1 1 1 1 1 5|
+ * | 6 6 6 6 6 1|
+ * |0 3 4 5 6 7 2|
+ * ----------------------------------------
+ * |
+ * |
+ * V
+ * ----------------------------------------
+ * | Data block containing offset |
+ * | 101342453 |
+ * | |
+ * | |
+ * ----------------------------------------
+ *
+ */
+
+static void find_metapath(struct gfs2_inode *ip, u64 block,
+			  struct metapath *mp)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int level = ip->i_di.di_height;
+	u64 remaining = block;
+
+	/*
+	 * Peel off one index per level, lowest level first: do_div() leaves
+	 * the quotient in "remaining" and yields the remainder.
+	 */
+	while (level--)
+		mp->mp_list[level] = do_div(remaining, sdp->sd_inptrs);
+}
+
+/**
+ * metapointer - Return pointer to start of metadata in a buffer
+ * @bh: The buffer
+ * @boundary: Set to 1 if the pointer is the last one in the buffer, else 0
+ * @height: The metadata height (0 = dinode)
+ * @mp: The metapath
+ *
+ * Return a pointer to the block number of the next height of the metadata
+ * tree given a buffer containing the pointer to the current height of the
+ * metadata tree.
+ */
+
+static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
+			       unsigned int height, const struct metapath *mp)
+{
+	unsigned int head_size;
+	u64 *first, *ptr, *end;
+
+	/* The dinode (height 0) has a larger header than an indirect block. */
+	if (height > 0)
+		head_size = sizeof(struct gfs2_meta_header);
+	else
+		head_size = sizeof(struct gfs2_dinode);
+
+	first = (u64 *)(bh->b_data + head_size);
+	end = (u64 *)(bh->b_data + bh->b_size);
+	ptr = first + mp->mp_list[height];
+
+	*boundary = (ptr + 1 == end) ? 1 : 0;
+
+	return ptr;
+}
+
+/**
+ * lookup_block - Get the next metadata block in metadata tree
+ * @ip: The GFS2 inode
+ * @bh: Buffer containing the pointers to metadata blocks
+ * @height: The height of the tree (0 = dinode)
+ * @mp: The metapath
+ * @create: Non-zero if we may create a new metadata block
+ * @new: Used to indicate if we did create a new metadata block
+ * @block: the returned disk block number
+ *
+ * Given a metatree, complete to a particular height, checks to see if the next
+ * height of the tree exists. If not the next height of the tree is created.
+ * The block number of the next height of the metadata tree is returned.
+ *
+ * Returns: the boundary flag from metapointer() when the pointer already
+ * existed, 0 otherwise (*block is 0 if nothing exists and @create is 0)
+ */
+
+static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
+ unsigned int height, struct metapath *mp, int create,
+ int *new, u64 *block)
+{
+ int boundary;
+ u64 *ptr = metapointer(bh, &boundary, height, mp);
+
+ if (*ptr) {
+ *block = be64_to_cpu(*ptr);
+ return boundary;
+ }
+
+ *block = 0;
+
+ if (!create)
+ return 0;
+
+ /* The bottom level of a non-directory points at data blocks */
+ if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
+ *block = gfs2_alloc_data(ip);
+ else
+ *block = gfs2_alloc_meta(ip);
+
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+ *ptr = cpu_to_be64(*block);
+ ip->i_di.di_blocks++;
+
+ *new = 1;
+ return 0;
+}
+
+/**
+ * gfs2_block_pointers - Map a block from an inode to a disk block
+ * @inode: The inode
+ * @lblock: The logical block number
+ * @create: Non-zero if missing blocks (and tree height) may be allocated
+ * @bh_map: The bh to be mapped
+ * @mp: metapath to use
+ * @maxlen: Maximum number of contiguous blocks to map; must be non-zero
+ *
+ * Find the block number on the current device which corresponds to an
+ * inode's block. If the block had to be created, the "new" flag is set
+ * on @bh_map. For an existing block, bh_map->b_size is grown for each
+ * following block that is contiguous on disk, up to @maxlen blocks or
+ * the end of the pointer block.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
+			       struct buffer_head *bh_map, struct metapath *mp,
+			       unsigned int maxlen)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct buffer_head *bh;
+	unsigned int bsize;
+	unsigned int height;
+	unsigned int end_of_metadata;
+	unsigned int x;
+	int error = 0;
+	int new = 0;
+	u64 dblock = 0;
+	int boundary;
+
+	BUG_ON(maxlen == 0);
+
+	if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
+		return 0;
+
+	bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
+
+	/* Grow the tree first if it is too short to reach lblock */
+	height = calc_tree_height(ip, (lblock + 1) * bsize);
+	if (ip->i_di.di_height < height) {
+		if (!create)
+			return 0;
+
+		error = build_height(inode, height);
+		if (error)
+			return error;
+	}
+
+	find_metapath(ip, lblock, mp);
+	end_of_metadata = ip->i_di.di_height - 1;
+
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		return error;
+
+	/* Walk down the indirect blocks to the lowest pointer block */
+	for (x = 0; x < end_of_metadata; x++) {
+		lookup_block(ip, bh, x, mp, create, &new, &dblock);
+		brelse(bh);
+		if (!dblock)
+			return 0;
+
+		error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh);
+		if (error)
+			return error;
+	}
+
+	boundary = lookup_block(ip, bh, end_of_metadata, mp, create, &new, &dblock);
+	clear_buffer_mapped(bh_map);
+	clear_buffer_new(bh_map);
+	clear_buffer_boundary(bh_map);
+
+	if (dblock) {
+		map_bh(bh_map, inode->i_sb, dblock);
+		/*
+		 * The boundary flag describes the mapping handed back to the
+		 * caller, so it belongs on bh_map, not on the metadata
+		 * buffer bh; the extent loop below also tests it on bh_map.
+		 */
+		if (boundary)
+			set_buffer_boundary(bh_map);
+		if (new) {
+			struct buffer_head *dibh;
+			/* NOTE(review): a failure here is not reported */
+			error = gfs2_meta_inode_buffer(ip, &dibh);
+			if (!error) {
+				gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+				gfs2_dinode_out(&ip->i_di, dibh->b_data);
+				brelse(dibh);
+			}
+			set_buffer_new(bh_map);
+			goto out_brelse;
+		}
+		/* Extend the mapping while the next blocks are contiguous */
+		while(--maxlen && !buffer_boundary(bh_map)) {
+			u64 eblock;
+
+			mp->mp_list[end_of_metadata]++;
+			boundary = lookup_block(ip, bh, end_of_metadata, mp, 0, &new, &eblock);
+			if (eblock != ++dblock)
+				break;
+			bh_map->b_size += (1 << inode->i_blkbits);
+			if (boundary)
+				set_buffer_boundary(bh_map);
+		}
+	}
+out_brelse:
+	brelse(bh);
+	return 0;
+}
+
+
+/*
+ * bmap_lock - take i_rw_mutex for a block mapping operation
+ * @inode: the inode being mapped
+ * @create: non-zero takes the lock for writing, zero for reading
+ */
+static inline void bmap_lock(struct inode *inode, int create)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	if (!create) {
+		down_read(&ip->i_rw_mutex);
+		return;
+	}
+
+	down_write(&ip->i_rw_mutex);
+}
+
+/*
+ * bmap_unlock - drop the lock taken by bmap_lock()
+ * @inode: the inode being mapped
+ * @create: must match the value given to bmap_lock()
+ */
+static inline void bmap_unlock(struct inode *inode, int create)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	if (!create) {
+		up_read(&ip->i_rw_mutex);
+		return;
+	}
+
+	up_write(&ip->i_rw_mutex);
+}
+
+/*
+ * gfs2_block_map - map a logical block of an inode under i_rw_mutex
+ * @inode: the inode
+ * @lblock: the logical block number
+ * @create: non-zero if the block may be allocated
+ * @bh: the buffer head that receives the mapping
+ * @maxlen: maximum number of blocks to map
+ *
+ * Returns: errno
+ */
+int gfs2_block_map(struct inode *inode, u64 lblock, int create,
+		   struct buffer_head *bh, unsigned int maxlen)
+{
+	struct metapath path;
+	int error;
+
+	bmap_lock(inode, create);
+	error = gfs2_block_pointers(inode, lblock, create, bh, &path, maxlen);
+	bmap_unlock(inode, create);
+
+	return error;
+}
+
+/*
+ * gfs2_extent_map - map a logical block and report the extent found
+ * @inode: the inode
+ * @lblock: the logical block number
+ * @new: in: non-zero if the block may be allocated;
+ *       out: 1 if a block was allocated, else 0
+ * @dblock: receives the disk block number
+ * @extlen: receives the extent length in blocks
+ *
+ * At most 32 blocks are mapped per call. None of the pointer arguments
+ * may be NULL.
+ *
+ * Returns: errno
+ */
+int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
+{
+	struct metapath mp;
+	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0, .b_size = 0 };
+	int ret;
+	int create;
+
+	BUG_ON(!extlen);
+	BUG_ON(!dblock);
+	BUG_ON(!new);
+
+	/* Only read *new once the pointer has been checked */
+	create = *new;
+
+	bmap_lock(inode, create);
+	ret = gfs2_block_pointers(inode, lblock, create, &bh, &mp, 32);
+	bmap_unlock(inode, create);
+	*extlen = bh.b_size >> inode->i_blkbits;
+	*dblock = bh.b_blocknr;
+	if (buffer_new(&bh))
+		*new = 1;
+	else
+		*new = 0;
+	return ret;
+}
+
+/**
+ * recursive_scan - recursively scan through the end of a file
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @mp: the path through the metadata to the point to start
+ * @height: the height the recursion is at
+ * @block: the indirect block to look at
+ * @first: 1 if this is the first block
+ * @bc: the call to make for each piece of metadata
+ * @data: data opaque to this function to pass to @bc
+ *
+ * When this is first called @height and @block should be zero and
+ * @first should be 1.
+ *
+ * At height zero the @dibh argument is ignored: the dinode buffer is read
+ * here and passed down to the deeper levels.
+ *
+ * Returns: errno
+ */
+
+static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
+ struct metapath *mp, unsigned int height,
+ u64 block, int first, block_call_t bc,
+ void *data)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *bh = NULL;
+ u64 *top, *bottom;
+ u64 bn;
+ int error;
+ int mh_size = sizeof(struct gfs2_meta_header);
+
+ if (!height) {
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ return error;
+ dibh = bh;
+
+ top = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
+ bottom = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
+ } else {
+ error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
+ if (error)
+ return error;
+
+ /* Only the first block at each height starts part way in */
+ top = (u64 *)(bh->b_data + mh_size) +
+ (first ? mp->mp_list[height] : 0);
+
+ bottom = (u64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
+ }
+
+ error = bc(ip, dibh, bh, top, bottom, height, data);
+ if (error)
+ goto out;
+
+ /* Descend into every non-zero pointer unless this is the bottom level */
+ if (height < ip->i_di.di_height - 1)
+ for (; top < bottom; top++, first = 0) {
+ if (!*top)
+ continue;
+
+ bn = be64_to_cpu(*top);
+
+ error = recursive_scan(ip, dibh, mp, height + 1, bn,
+ first, bc, data);
+ if (error)
+ break;
+ }
+
+out:
+ brelse(bh);
+ return error;
+}
+
+/**
+ * do_strip - Look for a particular layer of the file and strip it off
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @bh: A buffer of pointers
+ * @top: The first pointer in the buffer
+ * @bottom: One more than the last pointer
+ * @height: the height this buffer is at
+ * @data: a pointer to a struct strip_mine
+ *
+ * Does nothing unless @height equals sm_height. Otherwise frees every
+ * block pointed to from [@top, @bottom) and zeroes the pointers, except
+ * that the first pointer is kept while sm_first is set.
+ *
+ * Returns: errno
+ */
+
+static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
+ struct buffer_head *bh, u64 *top, u64 *bottom,
+ unsigned int height, void *data)
+{
+ struct strip_mine *sm = data;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_rgrp_list rlist;
+ u64 bn, bstart;
+ u32 blen;
+ u64 *p;
+ unsigned int rg_blocks = 0;
+ int metadata;
+ unsigned int revokes = 0;
+ int x;
+ int error;
+
+ if (!*top)
+ sm->sm_first = 0;
+
+ if (height != sm->sm_height)
+ return 0;
+
+ /* Skip over the first pointer, which is to be kept */
+ if (sm->sm_first) {
+ top++;
+ sm->sm_first = 0;
+ }
+
+ /* Pointers above the bottom level refer to metadata blocks */
+ metadata = (height != ip->i_di.di_height - 1);
+ if (metadata)
+ revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
+
+ error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
+ if (error)
+ return error;
+
+ memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+ bstart = 0;
+ blen = 0;
+
+ /* First pass: collect the resource groups of each run of blocks */
+ for (p = top; p < bottom; p++) {
+ if (!*p)
+ continue;
+
+ bn = be64_to_cpu(*p);
+
+ if (bstart + blen == bn)
+ blen++;
+ else {
+ if (bstart)
+ gfs2_rlist_add(sdp, &rlist, bstart);
+
+ bstart = bn;
+ blen = 1;
+ }
+ }
+
+ if (bstart)
+ gfs2_rlist_add(sdp, &rlist, bstart);
+ else
+ goto out; /* Nothing to do */
+
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+ /* Sum the ri_length of every resource group for the transaction size */
+ for (x = 0; x < rlist.rl_rgrps; x++) {
+ struct gfs2_rgrpd *rgd;
+ rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+ rg_blocks += rgd->rd_ri.ri_length;
+ }
+
+ error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+ if (error)
+ goto out_rlist;
+
+ error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
+ RES_INDIRECT + RES_STATFS + RES_QUOTA,
+ revokes);
+ if (error)
+ goto out_rg_gunlock;
+
+ down_write(&ip->i_rw_mutex);
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+ bstart = 0;
+ blen = 0;
+
+ /* Second pass: free each contiguous run and zero its pointers */
+ for (p = top; p < bottom; p++) {
+ if (!*p)
+ continue;
+
+ bn = be64_to_cpu(*p);
+
+ if (bstart + blen == bn)
+ blen++;
+ else {
+ if (bstart) {
+ if (metadata)
+ gfs2_free_meta(ip, bstart, blen);
+ else
+ gfs2_free_data(ip, bstart, blen);
+ }
+
+ bstart = bn;
+ blen = 1;
+ }
+
+ *p = 0;
+ if (!ip->i_di.di_blocks)
+ gfs2_consist_inode(ip);
+ ip->i_di.di_blocks--;
+ }
+ /* Free the final run, which the loop above leaves pending */
+ if (bstart) {
+ if (metadata)
+ gfs2_free_meta(ip, bstart, blen);
+ else
+ gfs2_free_data(ip, bstart, blen);
+ }
+
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+
+ up_write(&ip->i_rw_mutex);
+
+ gfs2_trans_end(sdp);
+
+out_rg_gunlock:
+ gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist:
+ gfs2_rlist_free(&rlist);
+out:
+ gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
+ return error;
+}
+
+/**
+ * do_grow - Make a file look bigger than it is
+ * @ip: the inode
+ * @size: the size to set the file to
+ *
+ * Called with an exclusive lock on @ip.
+ *
+ * Only di_size and the metadata tree height are changed; no data blocks
+ * are written here.
+ *
+ * Returns: errno
+ */
+
+static int do_grow(struct gfs2_inode *ip, u64 size)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al;
+ struct buffer_head *dibh;
+ unsigned int h;
+ int error;
+
+ al = gfs2_alloc_get(ip);
+
+ error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out;
+
+ error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+ if (error)
+ goto out_gunlock_q;
+
+ al->al_requested = sdp->sd_max_height + RES_DATA;
+
+ error = gfs2_inplace_reserve(ip);
+ if (error)
+ goto out_gunlock_q;
+
+ error = gfs2_trans_begin(sdp,
+ sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
+ RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+ if (error)
+ goto out_ipres;
+
+ /* The new size no longer fits in the dinode block after the header */
+ if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+ if (gfs2_is_stuffed(ip)) {
+ error = gfs2_unstuff_dinode(ip, NULL);
+ if (error)
+ goto out_end_trans;
+ }
+
+ h = calc_tree_height(ip, size);
+ if (ip->i_di.di_height < h) {
+ down_write(&ip->i_rw_mutex);
+ error = build_height(&ip->i_inode, h);
+ up_write(&ip->i_rw_mutex);
+ if (error)
+ goto out_end_trans;
+ }
+ }
+
+ ip->i_di.di_size = size;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out_end_trans;
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+
+ /* Unwind in the reverse order of acquisition */
+out_end_trans:
+ gfs2_trans_end(sdp);
+out_ipres:
+ gfs2_inplace_release(ip);
+out_gunlock_q:
+ gfs2_quota_unlock(ip);
+out:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
+
+/**
+ * gfs2_block_truncate_page - Deal with zeroing out data for truncate
+ * @mapping: the address space of the inode being truncated
+ *
+ * Zeroes the part of the block containing inode->i_size that lies at or
+ * beyond i_size, if that block is mapped.
+ *
+ * This is partly borrowed from ext3.
+ *
+ * Returns: 0 on success or if there was nothing to do, -EIO if the block
+ * could not be read
+ */
+static int gfs2_block_truncate_page(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ loff_t from = inode->i_size;
+ unsigned long index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize, iblock, length, pos;
+ struct buffer_head *bh;
+ struct page *page;
+ void *kaddr;
+ int err;
+
+ /* NOTE(review): failure to get the page is reported as success (0) -
+ confirm that this is intended */
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ return 0;
+
+ blocksize = inode->i_sb->s_blocksize;
+ /* Number of bytes from "offset" to the end of its block */
+ length = blocksize - (offset & (blocksize - 1));
+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+
+ /* Find the buffer that contains "offset" */
+ bh = page_buffers(page);
+ pos = blocksize;
+ while (offset >= pos) {
+ bh = bh->b_this_page;
+ iblock++;
+ pos += blocksize;
+ }
+
+ err = 0;
+
+ if (!buffer_mapped(bh)) {
+ gfs2_get_block(inode, iblock, bh, 0);
+ /* unmapped? It's a hole - nothing to do */
+ if (!buffer_mapped(bh))
+ goto unlock;
+ }
+
+ /* Ok, it's mapped. Make sure it's up-to-date */
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
+
+ if (!buffer_uptodate(bh)) {
+ err = -EIO;
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ /* Uhhuh. Read error. Complain and punt. */
+ if (!buffer_uptodate(bh))
+ goto unlock;
+ }
+
+ if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+ gfs2_trans_add_bh(ip->i_gl, bh, 0);
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0, length);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
+
+/*
+ * trunc_start - first stage of shrinking a file
+ * @ip: the inode
+ * @size: the new, smaller size
+ *
+ * For a stuffed inode the whole truncate is done here: the size is
+ * updated and the tail of the dinode is cleared. Otherwise the partial
+ * last block is zeroed, the new size is recorded and
+ * GFS2_DIF_TRUNC_IN_PROG is set in the dinode flags.
+ *
+ * Returns: 1 if the inode was stuffed, 0 on success for an unstuffed
+ * inode, or a negative errno
+ */
+static int trunc_start(struct gfs2_inode *ip, u64 size)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *dibh;
+ int journaled = gfs2_is_jdata(ip);
+ int error;
+
+ error = gfs2_trans_begin(sdp,
+ RES_DINODE + (journaled ? RES_JDATA : 0), 0);
+ if (error)
+ return error;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out;
+
+ if (gfs2_is_stuffed(ip)) {
+ ip->i_di.di_size = size;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
+ /* Positive return: presumably tells the caller no more is needed */
+ error = 1;
+
+ } else {
+ /* Only a size that is not block aligned leaves a partial block */
+ if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
+ error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
+
+ if (!error) {
+ ip->i_di.di_size = size;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+ ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ }
+ }
+
+ brelse(dibh);
+
+out:
+ gfs2_trans_end(sdp);
+ return error;
+}
+
+ /*
+ * trunc_dealloc - strip the blocks of @ip that lie beyond @size
+ * @ip: the inode
+ * @size: the size the file is being truncated to (0 strips everything)
+ *
+ * Builds the metapath of the last block to keep (block 0 when @size is 0)
+ * and then calls recursive_scan() with do_strip once per tree height,
+ * starting at the deepest level (di_height - 1) and working down to 0.
+ *
+ * Returns: 0 on success, or the first error from gfs2_quota_hold() or
+ * recursive_scan().
+ */
+ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+ {
+ unsigned int height = ip->i_di.di_height;
+ u64 lblock;
+ struct metapath mp;
+ int error;
+
+ if (!size)
+ lblock = 0;
+ else
+ lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
+
+ find_metapath(ip, lblock, &mp);
+ /* NOTE(review): return value of gfs2_alloc_get() is not checked here. */
+ gfs2_alloc_get(ip);
+
+ error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out;
+
+ while (height--) {
+ struct strip_mine sm;
+ /* sm_first is 1 for a partial truncate, 0 when freeing the whole file. */
+ sm.sm_first = !!size;
+ sm.sm_height = height;
+
+ error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
+ if (error)
+ break;
+ }
+
+ gfs2_quota_unhold(ip);
+
+out:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
+ /*
+ * trunc_end - final phase of a truncate
+ * @ip: the inode
+ *
+ * In a RES_DINODE transaction and with i_rw_mutex held for write, updates
+ * the timestamps, clears GFS2_DIF_TRUNC_IN_PROG and writes the dinode out.
+ * When the file is now empty the tree height is reset to 0, both goal
+ * blocks are pointed back at the inode's own block and everything after
+ * the dinode header is zeroed.
+ *
+ * Returns: 0 on success, or the error from gfs2_trans_begin() or
+ * gfs2_meta_inode_buffer().
+ */
+ static int trunc_end(struct gfs2_inode *ip)
+ {
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (error)
+ return error;
+
+ down_write(&ip->i_rw_mutex);
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out;
+
+ if (!ip->i_di.di_size) {
+ /* Empty file: collapse the metadata tree back into the dinode. */
+ ip->i_di.di_height = 0;
+ ip->i_di.di_goal_meta =
+ ip->i_di.di_goal_data =
+ ip->i_num.no_addr;
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+ }
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+ ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+
+out:
+ up_write(&ip->i_rw_mutex);
+ gfs2_trans_end(sdp);
+ return error;
+}
+
+/**
+ * do_shrink - make a file smaller
+ * @ip: the inode
+ * @size: the size to make the file
+ * @truncator: function to truncate the last partial block
+ *
+ * Called with an exclusive lock on @ip.
+ *
+ * Returns: errno
+ */
+
+static int do_shrink(struct gfs2_inode *ip, u64 size)
+{
+ int error;
+
+ error = trunc_start(ip, size);
+ if (error < 0)
+ return error;
+ if (error > 0)
+ return 0;
+
+ error = trunc_dealloc(ip, size);
+ if (!error)
+ error = trunc_end(ip);
+
+ return error;
+}
+
+/**
+ * gfs2_truncatei - make a file a given size
+ * @ip: the inode
+ * @size: the size to make the file
+ * @truncator: function to truncate the last partial block
+ *
+ * The file size can grow, shrink, or stay the same size.
+ *
+ * Returns: errno
+ */
+
+int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
+{
+ int error;
+
+ if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode)))
+ return -EINVAL;
+
+ if (size > ip->i_di.di_size)
+ error = do_grow(ip, size);
+ else
+ error = do_shrink(ip, size);
+
+ return error;
+}
+
+ /*
+  * gfs2_truncatei_resume - redo the dealloc and end phases of a truncate
+  * @ip: the inode
+  *
+  * Uses the size already recorded in the in-core dinode as the target.
+  *
+  * Returns: errno
+  */
+ int gfs2_truncatei_resume(struct gfs2_inode *ip)
+ {
+ 	int rc = trunc_dealloc(ip, ip->i_di.di_size);
+
+ 	if (rc)
+ 		return rc;
+
+ 	return trunc_end(ip);
+ }
+
+ /*
+  * gfs2_file_dealloc - strip every block of a file
+  * @ip: the inode
+  *
+  * Returns: errno from trunc_dealloc()
+  */
+ int gfs2_file_dealloc(struct gfs2_inode *ip)
+ {
+ 	const u64 new_size = 0;
+
+ 	return trunc_dealloc(ip, new_size);
+ }
+
+/**
+ * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
+ * @ip: the file
+ * @len: the number of bytes to be written to the file
+ * @data_blocks: returns the number of data blocks required
+ * @ind_blocks: returns the number of indirect blocks required
+ *
+ */
+
+void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
+ unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ unsigned int tmp;
+
+ if (gfs2_is_dir(ip)) {
+ *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
+ *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
+ } else {
+ *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
+ *ind_blocks = 3 * (sdp->sd_max_height - 1);
+ }
+
+ for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
+ tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+ *ind_blocks += tmp;
+ }
+}
+
+/**
+ * gfs2_write_alloc_required - figure out if a write will require an allocation
+ * @ip: the file being written to
+ * @offset: the offset to write to
+ * @len: the number of bytes being written
+ * @alloc_required: set to 1 if an alloc is required, 0 otherwise
+ *
+ * Returns: errno
+ */
+
+int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+ unsigned int len, int *alloc_required)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ u64 lblock, lblock_stop, dblock;
+ u32 extlen;
+ int new = 0;
+ int error = 0;
+
+ *alloc_required = 0;
+
+ if (!len)
+ return 0;
+
+ if (gfs2_is_stuffed(ip)) {
+ if (offset + len >
+ sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
+ *alloc_required = 1;
+ return 0;
+ }
+
+ if (gfs2_is_dir(ip)) {
+ unsigned int bsize = sdp->sd_jbsize;
+ lblock = offset;
+ do_div(lblock, bsize);
+ lblock_stop = offset + len + bsize - 1;
+ do_div(lblock_stop, bsize);
+ } else {
+ unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+ lblock = offset >> shift;
+ lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+ }
+
+ for (; lblock < lblock_stop; lblock += extlen) {
+ error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
+ if (error)
+ return error;
+
+ if (!dblock) {
+ *alloc_required = 1;
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..0fd379b4cd9e
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+ #ifndef __BMAP_DOT_H__
+ #define __BMAP_DOT_H__
+
+ /*
+  * Forward declarations: only pointers to these types appear in the
+  * prototypes below. struct buffer_head is declared here as well so that
+  * its use in gfs2_block_map() does not introduce a type scoped to the
+  * parameter list when this header is included first.
+  */
+ struct inode;
+ struct gfs2_inode;
+ struct page;
+ struct buffer_head;
+
+ /* Block mapping */
+ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+ int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh, unsigned int maxlen);
+ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
+
+ /* Truncation and deallocation */
+ int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
+ int gfs2_truncatei_resume(struct gfs2_inode *ip);
+ int gfs2_file_dealloc(struct gfs2_inode *ip);
+
+ /* Write reservation helpers */
+ void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
+ 			    unsigned int *data_blocks,
+ 			    unsigned int *ind_blocks);
+ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+ 			      unsigned int len, int *alloc_required);
+
+ #endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..cab1f68d4685
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "daemon.h"
+#include "glock.h"
+#include "log.h"
+#include "quota.h"
+#include "recovery.h"
+#include "super.h"
+#include "util.h"
+
+/* This uses schedule_timeout() instead of msleep() because it's good for
+ the daemons to wake up more often than the timeout when unmounting so
+ the user's unmount doesn't sit there forever.
+
+ The kthread functions used to start these daemons block and flush signals. */
+
+/**
+ * gfs2_scand - Look for cached glocks and inodes to toss from memory
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * One of these daemons runs, finding candidates to add to sd_reclaim_list.
+ * See gfs2_glockd()
+ */
+
+int gfs2_scand(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+ unsigned long t;
+
+ while (!kthread_should_stop()) {
+ gfs2_scand_internal(sdp);
+ t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
+ schedule_timeout_interruptible(t);
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_glockd - Reclaim unused glock structures
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
+ * Number of daemons can be set by user, with num_glockd mount option.
+ */
+
+int gfs2_glockd(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+
+ while (!kthread_should_stop()) {
+ while (atomic_read(&sdp->sd_reclaim_count))
+ gfs2_reclaim_glock(sdp);
+
+ wait_event_interruptible(sdp->sd_reclaim_wq,
+ (atomic_read(&sdp->sd_reclaim_count) ||
+ kthread_should_stop()));
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_recoverd - Recover dead machine's journals
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_recoverd(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+ unsigned long t;
+
+ while (!kthread_should_stop()) {
+ gfs2_check_journals(sdp);
+ t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
+ schedule_timeout_interruptible(t);
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * Also, periodically check to make sure that we're using the most recent
+ * journal index.
+ */
+
+int gfs2_logd(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+ struct gfs2_holder ji_gh;
+ unsigned long t;
+
+ while (!kthread_should_stop()) {
+ /* Advance the log tail */
+
+ t = sdp->sd_log_flush_time +
+ gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+
+ gfs2_ail1_empty(sdp, DIO_ALL);
+
+ if (time_after_eq(jiffies, t)) {
+ gfs2_log_flush(sdp, NULL);
+ sdp->sd_log_flush_time = jiffies;
+ }
+
+ /* Check for latest journal index */
+
+ t = sdp->sd_jindex_refresh_time +
+ gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
+
+ if (time_after_eq(jiffies, t)) {
+ if (!gfs2_jindex_hold(sdp, &ji_gh))
+ gfs2_glock_dq_uninit(&ji_gh);
+ sdp->sd_jindex_refresh_time = jiffies;
+ }
+
+ t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
+ schedule_timeout_interruptible(t);
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_quotad(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+ unsigned long t;
+ int error;
+
+ while (!kthread_should_stop()) {
+ /* Update the master statfs file */
+
+ t = sdp->sd_statfs_sync_time +
+ gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+
+ if (time_after_eq(jiffies, t)) {
+ error = gfs2_statfs_sync(sdp);
+ if (error &&
+ error != -EROFS &&
+ !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ fs_err(sdp, "quotad: (1) error=%d\n", error);
+ sdp->sd_statfs_sync_time = jiffies;
+ }
+
+ /* Update quota file */
+
+ t = sdp->sd_quota_sync_time +
+ gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
+
+ if (time_after_eq(jiffies, t)) {
+ error = gfs2_quota_sync(sdp);
+ if (error &&
+ error != -EROFS &&
+ !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ fs_err(sdp, "quotad: (2) error=%d\n", error);
+ sdp->sd_quota_sync_time = jiffies;
+ }
+
+ gfs2_quota_scan(sdp);
+
+ t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
+ schedule_timeout_interruptible(t);
+ }
+
+ return 0;
+}
+
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..801007120fb2
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+ #ifndef __DAEMON_DOT_H__
+ #define __DAEMON_DOT_H__
+
+ /*
+ * Kernel thread entry points. Each one takes the filesystem's
+ * struct gfs2_sbd pointer as @data and loops until kthread_should_stop().
+ */
+ int gfs2_scand(void *data);
+ int gfs2_glockd(void *data);
+ int gfs2_recoverd(void *data);
+ int gfs2_logd(void *data);
+ int gfs2_quotad(void *data);
+
+ #endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..459498cac93b
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1961 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+/*
+ * Implements Extendible Hashing as described in:
+ * "Extendible Hashing" by Fagin, et al in
+ * __ACM Trans. on Database Systems__, Sept 1979.
+ *
+ *
+ * Here's the layout of dirents which is essentially the same as that of ext2
+ * within a single block. The field de_name_len is the number of bytes
+ * actually required for the name (no null terminator). The field de_rec_len
+ * is the number of bytes allocated to the dirent. The offset of the next
+ * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
+ * deleted, the preceding dirent inherits its allocated space, ie
+ * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
+ * by adding de_rec_len to the current dirent, this essentially causes the
+ * deleted dirent to get jumped over when iterating through all the dirents.
+ *
+ * When deleting the first dirent in a block, there is no previous dirent so
+ * the field de_ino is set to zero to designate it as deleted. When allocating
+ * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
+ * first dirent has (de_ino == 0) and de_rec_len is large enough, this first
+ * dirent is allocated. Otherwise it must go through all the 'used' dirents
+ * searching for one in which the amount of total space minus the amount of
+ * used space will provide enough space for the new dirent.
+ *
+ * There are two types of blocks in which dirents reside. In a stuffed dinode,
+ * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
+ * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
+ * beginning of the leaf block. The dirents reside in leaves when
+ *
+ * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
+ *
+ * Otherwise, the dirents are "linear", within a single stuffed dinode block.
+ *
+ * When the dirents are in leaves, the actual contents of the directory file are
+ * used as an array of 64-bit block pointers pointing to the leaf blocks. The
+ * dirents are NOT in the directory file itself. There can be more than one
+ * block pointer in the array that points to the same leaf. In fact, when a
+ * directory is first converted from linear to exhash, all of the pointers
+ * point to the same leaf.
+ *
+ * When a leaf is completely full, the size of the hash table can be
+ * doubled unless it is already at the maximum size which is hard coded into
+ * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
+ * but never before the maximum hash table size has been reached.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/buffer_head.h>
+#include <linux/sort.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/vmalloc.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "bmap.h"
+#include "util.h"
+
+ /* Block kinds reported by dirent_first() */
+ #define IS_LEAF 1 /* Hashed (leaf) directory */
+ #define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
+
+ /* Convert between a 32-bit name hash and an offset by shifting one bit. */
+ #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
+ #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+
+ /* Callback signature taking a hash table index range and a leaf block number. */
+ typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
+ u64 leaf_no, void *data);
+ /*
+ * Per-dirent callback for gfs2_dirent_scan(): return 0 to keep scanning,
+ * non-zero to stop (see gfs2_dirent_scan() for how the value is used).
+ */
+ typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
+ const struct qstr *name, void *opaque);
+
+
+ /*
+  * gfs2_dir_get_new_buffer - set up a fresh journaled-data block
+  * @ip: the directory inode
+  * @block: the disk block number
+  * @bhp: returns the buffer
+  *
+  * The buffer is added to the current transaction, stamped as
+  * GFS2_METATYPE_JD and zeroed after the meta header.
+  *
+  * Returns: always 0
+  */
+ int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+ 			    struct buffer_head **bhp)
+ {
+ 	struct buffer_head *fresh = gfs2_meta_new(ip->i_gl, block);
+
+ 	gfs2_trans_add_bh(ip->i_gl, fresh, 1);
+ 	gfs2_metatype_set(fresh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
+ 	gfs2_buffer_clear_tail(fresh, sizeof(struct gfs2_meta_header));
+
+ 	*bhp = fresh;
+ 	return 0;
+ }
+
+ /*
+  * gfs2_dir_get_existing_buffer - read an existing journaled-data block
+  * @ip: the directory inode
+  * @block: the disk block number
+  * @bhp: returns the buffer on success
+  *
+  * Returns: 0 on success, the read error, or -EIO if the block is not of
+  *          type GFS2_METATYPE_JD (the buffer is released in that case)
+  */
+ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
+ 					struct buffer_head **bhp)
+ {
+ 	struct buffer_head *found;
+ 	int rc;
+
+ 	rc = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &found);
+ 	if (rc)
+ 		return rc;
+
+ 	if (!gfs2_metatype_check(GFS2_SB(&ip->i_inode), found, GFS2_METATYPE_JD)) {
+ 		*bhp = found;
+ 		return 0;
+ 	}
+
+ 	brelse(found);
+ 	return -EIO;
+ }
+
+ /*
+ * gfs2_dir_write_stuffed - write directory data into a stuffed dinode
+ * @ip: the directory inode
+ * @buf: the data to copy
+ * @offset: byte offset within the stuffed data area
+ * @size: number of bytes to copy
+ *
+ * No bounds check is done here; gfs2_dir_write_data() only calls this when
+ * offset + size fits in the dinode block. The inode size is grown if the
+ * write extends it, and the timestamps are updated.
+ *
+ * Returns: @size on success, or the error from gfs2_meta_inode_buffer()
+ */
+ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
+ unsigned int offset, unsigned int size)
+ {
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ return error;
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ /* Stuffed data starts immediately after the on-disk dinode header. */
+ memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
+ if (ip->i_di.di_size < offset + size)
+ ip->i_di.di_size = offset + size;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+
+ brelse(dibh);
+
+ return size;
+}
+
+
+
+/**
+ * gfs2_dir_write_data - Write directory information to the inode
+ * @ip: The GFS2 inode
+ * @buf: The buffer containing information to be written
+ * @offset: The file offset to start writing at
+ * @size: The amount of data to write
+ *
+ * Returns: The number of bytes correctly written or error code
+ */
+static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
+ u64 offset, unsigned int size)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *dibh;
+ u64 lblock, dblock;
+ u32 extlen = 0;
+ unsigned int o;
+ int copied = 0;
+ int error = 0;
+
+ if (!size)
+ return 0;
+
+ if (gfs2_is_stuffed(ip) &&
+ offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
+ return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
+ size);
+
+ if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
+ return -EINVAL;
+
+ if (gfs2_is_stuffed(ip)) {
+ error = gfs2_unstuff_dinode(ip, NULL);
+ if (error)
+ return error;
+ }
+
+ lblock = offset;
+ o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
+
+ while (copied < size) {
+ unsigned int amount;
+ struct buffer_head *bh;
+ int new;
+
+ amount = size - copied;
+ if (amount > sdp->sd_sb.sb_bsize - o)
+ amount = sdp->sd_sb.sb_bsize - o;
+
+ if (!extlen) {
+ new = 1;
+ error = gfs2_extent_map(&ip->i_inode, lblock, &new,
+ &dblock, &extlen);
+ if (error)
+ goto fail;
+ error = -EIO;
+ if (gfs2_assert_withdraw(sdp, dblock))
+ goto fail;
+ }
+
+ if (amount == sdp->sd_jbsize || new)
+ error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
+ else
+ error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
+
+ if (error)
+ goto fail;
+
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ memcpy(bh->b_data + o, buf, amount);
+ brelse(bh);
+ if (error)
+ goto fail;
+
+ buf += amount;
+ copied += amount;
+ lblock++;
+ dblock++;
+ extlen--;
+
+ o = sizeof(struct gfs2_meta_header);
+ }
+
+out:
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ return error;
+
+ if (ip->i_di.di_size < offset + copied)
+ ip->i_di.di_size = offset + copied;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+
+ return copied;
+fail:
+ if (copied)
+ goto out;
+ return error;
+}
+
+ /*
+  * gfs2_dir_read_stuffed - read directory data out of a stuffed dinode
+  * @ip: the directory inode
+  * @buf: destination buffer
+  * @offset: byte offset within the stuffed data area
+  * @size: number of bytes to copy
+  *
+  * Returns: @size on success, or the error from gfs2_meta_inode_buffer()
+  */
+ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
+ 				 u64 offset, unsigned int size)
+ {
+ 	struct buffer_head *dibh;
+ 	int rc = gfs2_meta_inode_buffer(ip, &dibh);
+
+ 	if (rc)
+ 		return rc;
+
+ 	/* Stuffed data starts immediately after the on-disk dinode header. */
+ 	memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode) + offset, size);
+ 	brelse(dibh);
+
+ 	return size;
+ }
+
+
+/**
+ * gfs2_dir_read_data - Read a data from a directory inode
+ * @ip: The GFS2 Inode
+ * @buf: The buffer to place result into
+ * @offset: File offset to begin jdata_readng from
+ * @size: Amount of data to transfer
+ *
+ * Returns: The amount of data actually copied or the error
+ */
+static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
+ unsigned int size, unsigned ra)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ u64 lblock, dblock;
+ u32 extlen = 0;
+ unsigned int o;
+ int copied = 0;
+ int error = 0;
+
+ if (offset >= ip->i_di.di_size)
+ return 0;
+
+ if (offset + size > ip->i_di.di_size)
+ size = ip->i_di.di_size - offset;
+
+ if (!size)
+ return 0;
+
+ if (gfs2_is_stuffed(ip))
+ return gfs2_dir_read_stuffed(ip, buf, offset, size);
+
+ if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
+ return -EINVAL;
+
+ lblock = offset;
+ o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
+
+ while (copied < size) {
+ unsigned int amount;
+ struct buffer_head *bh;
+ int new;
+
+ amount = size - copied;
+ if (amount > sdp->sd_sb.sb_bsize - o)
+ amount = sdp->sd_sb.sb_bsize - o;
+
+ if (!extlen) {
+ new = 0;
+ error = gfs2_extent_map(&ip->i_inode, lblock, &new,
+ &dblock, &extlen);
+ if (error || !dblock)
+ goto fail;
+ BUG_ON(extlen < 1);
+ if (!ra)
+ extlen = 1;
+ bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
+ }
+ if (!bh) {
+ error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
+ if (error)
+ goto fail;
+ }
+ error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD);
+ if (error) {
+ brelse(bh);
+ goto fail;
+ }
+ dblock++;
+ extlen--;
+ memcpy(buf, bh->b_data + o, amount);
+ brelse(bh);
+ bh = NULL;
+ buf += amount;
+ copied += amount;
+ lblock++;
+ o = sizeof(struct gfs2_meta_header);
+ }
+
+ return copied;
+fail:
+ return (copied) ? copied : error;
+}
+
+ /*
+  * __gfs2_dirent_find - test whether a dirent matches a name
+  * @dent: the dirent to test
+  * @name: the name (with hash) being looked for
+  * @ret: value to return on a match
+  *
+  * An entry matches when it is in use (non-zero inode address) and its
+  * hash, name length and name bytes all equal those of @name.
+  *
+  * Returns: @ret on a match, 0 otherwise
+  */
+ static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
+ 				     const struct qstr *name, int ret)
+ {
+ 	if (dent->de_inum.no_addr == 0)
+ 		return 0;
+ 	if (be32_to_cpu(dent->de_hash) != name->hash)
+ 		return 0;
+ 	if (be16_to_cpu(dent->de_name_len) != name->len)
+ 		return 0;
+ 	/* The name bytes follow the fixed-size dirent header. */
+ 	if (memcmp(dent+1, name->name, name->len) != 0)
+ 		return 0;
+
+ 	return ret;
+ }
+
+ /*
+ * Scan callback: returns 1 when @dent matches @name, so that
+ * gfs2_dirent_scan() hands back the matching dirent itself.
+ * @opaque is unused.
+ */
+ static int gfs2_dirent_find(const struct gfs2_dirent *dent,
+ const struct qstr *name,
+ void *opaque)
+ {
+ return __gfs2_dirent_find(dent, name, 1);
+}
+
+ /*
+ * Scan callback: returns 2 when @dent matches @name, so that
+ * gfs2_dirent_scan() hands back the dirent preceding the match (or the
+ * match itself when it is the first in the block). @opaque is unused.
+ */
+ static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
+ const struct qstr *name,
+ void *opaque)
+ {
+ return __gfs2_dirent_find(dent, name, 2);
+}
+
+/*
+ * name->name holds ptr to start of block.
+ * name->len holds size of block.
+ */
+static int gfs2_dirent_last(const struct gfs2_dirent *dent,
+ const struct qstr *name,
+ void *opaque)
+{
+ const char *start = name->name;
+ const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
+ if (name->len == (end - start))
+ return 1;
+ return 0;
+}
+
+ /*
+  * Scan callback matching the first dirent with enough slack in its record
+  * to hold a new entry for @name. An unused dirent (zero inode address)
+  * counts as occupying only an empty-name header.
+  *
+  * Returns: 1 if the new entry fits, 0 otherwise
+  */
+ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
+ 				  const struct qstr *name,
+ 				  void *opaque)
+ {
+ 	unsigned totlen = be16_to_cpu(dent->de_rec_len);
+ 	unsigned required = GFS2_DIRENT_SIZE(name->len);
+ 	unsigned used;
+
+ 	if (dent->de_inum.no_addr)
+ 		used = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+ 	else
+ 		used = GFS2_DIRENT_SIZE(0);
+
+ 	return (totlen - used >= required) ? 1 : 0;
+ }
+
+ /* State handed to gfs2_dirent_gather() through the scan's opaque pointer. */
+ struct dirent_gather {
+ const struct gfs2_dirent **pdent; /* array receiving pointers to in-use dirents */
+ unsigned offset; /* next free slot in pdent[] */
+ };
+
+ /*
+  * Scan callback that records every in-use dirent in the dirent_gather
+  * passed via @opaque. Always returns 0 so the scan visits the whole block.
+  * No bound is checked on the destination array here.
+  */
+ static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
+ 			      const struct qstr *name,
+ 			      void *opaque)
+ {
+ 	struct dirent_gather *gather = opaque;
+
+ 	if (!dent->de_inum.no_addr)
+ 		return 0;
+
+ 	gather->pdent[gather->offset] = dent;
+ 	gather->offset++;
+ 	return 0;
+ }
+
+/*
+ * Other possible things to check:
+ * - Inode located within filesystem size (and on valid block)
+ * - Valid directory entry type
+ * Not sure how heavy-weight we want to make this... could also check
+ * hash is correct for example, but that would take a lot of extra time.
+ * For now the most important thing is to check that the various sizes
+ * are correct.
+ */
+static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
+ unsigned int size, unsigned int len, int first)
+{
+ const char *msg = "gfs2_dirent too small";
+ if (unlikely(size < sizeof(struct gfs2_dirent)))
+ goto error;
+ msg = "gfs2_dirent misaligned";
+ if (unlikely(offset & 0x7))
+ goto error;
+ msg = "gfs2_dirent points beyond end of block";
+ if (unlikely(offset + size > len))
+ goto error;
+ msg = "zero inode number";
+ if (unlikely(!first && !dent->de_inum.no_addr))
+ goto error;
+ msg = "name length is greater than space in dirent";
+ if (dent->de_inum.no_addr &&
+ unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
+ size))
+ goto error;
+ return 0;
+error:
+ printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
+ first ? "first in block" : "not first in block");
+ return -EIO;
+}
+
+ /*
+  * gfs2_dirent_offset - find where dirents start in a directory block
+  * @buf: start of the block (must not be NULL)
+  *
+  * Returns: sizeof(struct gfs2_leaf) for a leaf block,
+  *          sizeof(struct gfs2_dinode) for a dinode block, or -1 (after
+  *          logging) for any other metadata type
+  */
+ static int gfs2_dirent_offset(const void *buf)
+ {
+ 	const struct gfs2_meta_header *hdr = buf;
+
+ 	BUG_ON(buf == NULL);
+
+ 	switch(be32_to_cpu(hdr->mh_type)) {
+ 	case GFS2_METATYPE_LF:
+ 		return sizeof(struct gfs2_leaf);
+ 	case GFS2_METATYPE_DI:
+ 		return sizeof(struct gfs2_dinode);
+ 	default:
+ 		break;
+ 	}
+
+ 	printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
+ 	       be32_to_cpu(hdr->mh_type));
+ 	return -1;
+ }
+
+ /*
+ * gfs2_dirent_scan - walk the dirents of one block, applying a callback
+ * @inode: the directory inode (used only to flag inconsistency)
+ * @buf: start of the block
+ * @len: length of the block in bytes
+ * @scan: callback invoked on each dirent
+ * @name: passed through to @scan
+ * @opaque: passed through to @scan
+ *
+ * Each dirent is validated with gfs2_check_dirent() before @scan sees it.
+ * The walk stops when @scan returns non-zero or when the records end
+ * exactly at @len.
+ *
+ * Returns: NULL if @scan returned 0 for every dirent; the current dirent
+ * if it returned 1; the previous dirent (or the current one when
+ * there is no previous) if it returned 2; ERR_PTR() of a negative
+ * return; ERR_PTR(-EIO) if the block failed validation. Any other
+ * positive return is a BUG.
+ */
+ static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
+ unsigned int len, gfs2_dscan_t scan,
+ const struct qstr *name,
+ void *opaque)
+ {
+ struct gfs2_dirent *dent, *prev;
+ unsigned offset;
+ unsigned size;
+ int ret = 0;
+
+ ret = gfs2_dirent_offset(buf);
+ if (ret < 0)
+ goto consist_inode;
+
+ offset = ret;
+ prev = NULL;
+ dent = buf + offset;
+ size = be16_to_cpu(dent->de_rec_len);
+ /* Only the first dirent is allowed to have a zero inode number. */
+ if (gfs2_check_dirent(dent, offset, size, len, 1))
+ goto consist_inode;
+ do {
+ ret = scan(dent, name, opaque);
+ if (ret)
+ break;
+ offset += size;
+ /* Records must tile the block exactly; reaching len ends the walk. */
+ if (offset == len)
+ break;
+ prev = dent;
+ dent = buf + offset;
+ size = be16_to_cpu(dent->de_rec_len);
+ if (gfs2_check_dirent(dent, offset, size, len, 0))
+ goto consist_inode;
+ } while(1);
+
+ switch(ret) {
+ case 0:
+ return NULL;
+ case 1:
+ return dent;
+ case 2:
+ return prev ? prev : dent;
+ default:
+ BUG_ON(ret > 0);
+ return ERR_PTR(ret);
+ }
+
+consist_inode:
+ gfs2_consist_inode(GFS2_I(inode));
+ return ERR_PTR(-EIO);
+}
+
+
+/**
+ * dirent_first - Return the first dirent
+ * @dip: the directory
+ * @bh: The buffer
+ * @dent: Pointer to list of dirents
+ *
+ * return first dirent whether bh points to leaf or stuffed dinode
+ *
+ * Returns: IS_LEAF, IS_DINODE, or -errno
+ */
+
+static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
+ struct gfs2_dirent **dent)
+{
+ struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
+
+ if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
+ if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
+ return -EIO;
+ *dent = (struct gfs2_dirent *)(bh->b_data +
+ sizeof(struct gfs2_leaf));
+ return IS_LEAF;
+ } else {
+ if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
+ return -EIO;
+ *dent = (struct gfs2_dirent *)(bh->b_data +
+ sizeof(struct gfs2_dinode));
+ return IS_DINODE;
+ }
+}
+
+ /*
+  * dirent_check_reclen - validate a dirent's record length
+  * @dip: the directory (flagged inconsistent on failure)
+  * @d: the dirent
+  * @end_p: one past the last byte of the block
+  *
+  * Returns: the record length if another dirent follows, -ENOENT if the
+  *          record ends exactly at @end_p, or -EIO if the length is too
+  *          small or runs past @end_p
+  */
+ static int dirent_check_reclen(struct gfs2_inode *dip,
+ 			       const struct gfs2_dirent *d, const void *end_p)
+ {
+ 	u16 rec_len = be16_to_cpu(d->de_rec_len);
+ 	const void *next;
+
+ 	if (unlikely(rec_len < sizeof(struct gfs2_dirent)))
+ 		goto broken;
+
+ 	next = (const void *)d + rec_len;
+ 	if (next == end_p)
+ 		return -ENOENT;
+ 	if (next < end_p)
+ 		return rec_len;
+
+ broken:
+ 	gfs2_consist_inode(dip);
+ 	return -EIO;
+ }
+
+/**
+ * dirent_next - Next dirent
+ * @dip: the directory
+ * @bh: The buffer
+ * @dent: Pointer to list of dirents
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
+ struct gfs2_dirent **dent)
+{
+ struct gfs2_dirent *cur = *dent, *tmp;
+ char *bh_end = bh->b_data + bh->b_size;
+ int ret;
+
+ ret = dirent_check_reclen(dip, cur, bh_end);
+ if (ret < 0)
+ return ret;
+
+ tmp = (void *)cur + ret;
+ ret = dirent_check_reclen(dip, tmp, bh_end);
+ if (ret == -EIO)
+ return ret;
+
+ /* Only the first dent could ever have de_inum.no_addr == 0 */
+ if (!tmp->de_inum.no_addr) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+
+ *dent = tmp;
+ return 0;
+}
+
+/**
+ * dirent_del - Delete a dirent
+ * @dip: The GFS2 inode
+ * @bh: The buffer
+ * @prev: The previous dirent
+ * @cur: The current dirent
+ *
+ */
+
+static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
+ struct gfs2_dirent *prev, struct gfs2_dirent *cur)
+{
+ u16 cur_rec_len, prev_rec_len;
+
+ if (!cur->de_inum.no_addr) {
+ gfs2_consist_inode(dip);
+ return;
+ }
+
+ gfs2_trans_add_bh(dip->i_gl, bh, 1);
+
+ /* If there is no prev entry, this is the first entry in the block.
+ The de_rec_len is already as big as it needs to be. Just zero
+ out the inode number and return. */
+
+ if (!prev) {
+ cur->de_inum.no_addr = 0; /* No endianess worries */
+ return;
+ }
+
+ /* Combine this dentry with the previous one. */
+
+ prev_rec_len = be16_to_cpu(prev->de_rec_len);
+ cur_rec_len = be16_to_cpu(cur->de_rec_len);
+
+ if ((char *)prev + prev_rec_len != (char *)cur)
+ gfs2_consist_inode(dip);
+ if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
+ gfs2_consist_inode(dip);
+
+ prev_rec_len += cur_rec_len;
+ prev->de_rec_len = cpu_to_be16(prev_rec_len);
+}
+
+/*
+ * Takes a dent from which to grab space as an argument. Returns the
+ * newly created dent.
+ */
+static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
+ struct gfs2_dirent *dent,
+ const struct qstr *name,
+ struct buffer_head *bh)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_dirent *ndent;
+ unsigned offset = 0, totlen;
+
+ if (dent->de_inum.no_addr)
+ offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+ totlen = be16_to_cpu(dent->de_rec_len);
+ BUG_ON(offset + name->len > totlen);
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ ndent = (struct gfs2_dirent *)((char *)dent + offset);
+ dent->de_rec_len = cpu_to_be16(offset);
+ gfs2_qstr2dirent(name, totlen - offset, ndent);
+ return ndent;
+}
+
+ /*
+  * gfs2_dirent_alloc - carve a new dirent for @name out of a block
+  * @inode: the directory inode
+  * @bh: the buffer holding the block
+  * @name: the name the new dirent is for
+  *
+  * Returns: the new dirent, NULL if no dirent in the block has enough
+  *          spare room, or an ERR_PTR() from the scan
+  */
+ static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
+ 					     struct buffer_head *bh,
+ 					     const struct qstr *name)
+ {
+ 	struct gfs2_dirent *donor;
+
+ 	donor = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+ 				 gfs2_dirent_find_space, name, NULL);
+ 	if (donor == NULL || IS_ERR(donor))
+ 		return donor;
+
+ 	return gfs2_init_dirent(inode, donor, name, bh);
+ }
+
+ /*
+ * get_leaf - read a leaf block and check its metadata type
+ * @dip: the directory inode
+ * @leaf_no: disk block number of the leaf
+ * @bhp: returns the buffer
+ *
+ * Returns: 0 on success, the read error, or -EIO if the block is not of
+ * type GFS2_METATYPE_LF
+ *
+ * NOTE(review): on the -EIO path the buffer read into *bhp is not released
+ * here; confirm whether callers are expected to brelse() it.
+ */
+ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
+ struct buffer_head **bhp)
+ {
+ int error;
+
+ error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
+ if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
+ /* printk(KERN_INFO "block num=%llu\n", leaf_no); */
+ error = -EIO;
+ }
+
+ return error;
+}
+
+/**
+ * get_leaf_nr - Get a leaf number associated with the index
+ * @dip: The GFS2 inode
+ * @index:
+ * @leaf_out:
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
+ u64 *leaf_out)
+{
+ u64 leaf_no;
+ int error;
+
+ error = gfs2_dir_read_data(dip, (char *)&leaf_no,
+ index * sizeof(u64),
+ sizeof(u64), 0);
+ if (error != sizeof(u64))
+ return (error < 0) ? error : -EIO;
+
+ *leaf_out = be64_to_cpu(leaf_no);
+
+ return 0;
+}
+
+ /*
+  * get_first_leaf - read the leaf a hash table index points at
+  * @dip: the directory inode
+  * @index: index into the hash table
+  * @bh_out: returns the leaf's buffer
+  *
+  * Returns: 0 on success, error code otherwise
+  */
+ static int get_first_leaf(struct gfs2_inode *dip, u32 index,
+ 			  struct buffer_head **bh_out)
+ {
+ 	u64 leaf_no;
+ 	int rc = get_leaf_nr(dip, index, &leaf_no);
+
+ 	if (rc)
+ 		return rc;
+
+ 	return get_leaf(dip, leaf_no, bh_out);
+ }
+
+ /*
+ * gfs2_dirent_search - run a dirent scan over the right block(s) for a name
+ * @inode: the directory inode
+ * @name: the name (with hash) to search for
+ * @scan: the scan callback to apply
+ * @pbh: returns the buffer containing the result
+ *
+ * For an exhash directory the top di_depth bits of the hash select the
+ * hash table slot; that leaf and every leaf chained from it via lf_next
+ * are scanned in turn. For a linear directory the dinode block is scanned.
+ *
+ * Returns: the dirent chosen by the scan with *pbh holding a reference to
+ * its buffer; NULL if nothing matched; or an ERR_PTR(). *pbh is
+ * set to NULL only on the paths that pass through got_dent; the
+ * early returns (including the not-found return in the exhash
+ * branch) leave it untouched.
+ */
+ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
+ const struct qstr *name,
+ gfs2_dscan_t scan,
+ struct buffer_head **pbh)
+ {
+ struct buffer_head *bh;
+ struct gfs2_dirent *dent;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ int error;
+
+ if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+ struct gfs2_leaf *leaf;
+ unsigned hsize = 1 << ip->i_di.di_depth;
+ unsigned index;
+ u64 ln;
+ /* The file must be exactly one 64-bit pointer per hash slot. */
+ if (hsize * sizeof(u64) != ip->i_di.di_size) {
+ gfs2_consist_inode(ip);
+ return ERR_PTR(-EIO);
+ }
+
+ index = name->hash >> (32 - ip->i_di.di_depth);
+ error = get_first_leaf(ip, index, &bh);
+ if (error)
+ return ERR_PTR(error);
+ do {
+ dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+ scan, name, NULL);
+ /* Non-NULL covers both a hit and an ERR_PTR from the scan. */
+ if (dent)
+ goto got_dent;
+ leaf = (struct gfs2_leaf *)bh->b_data;
+ ln = be64_to_cpu(leaf->lf_next);
+ brelse(bh);
+ if (!ln)
+ break;
+
+ error = get_leaf(ip, ln, &bh);
+ } while(!error);
+
+ return error ? ERR_PTR(error) : NULL;
+ }
+
+
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ return ERR_PTR(error);
+ dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
+got_dent:
+ if (unlikely(dent == NULL || IS_ERR(dent))) {
+ brelse(bh);
+ bh = NULL;
+ }
+ *pbh = bh;
+ return dent;
+}
+
+/*
+ * new_leaf - allocate and initialise an empty directory leaf block
+ * @inode: the directory
+ * @pbh: receives the buffer of the new leaf
+ * @depth: value stored in the leaf's lf_depth
+ *
+ * The leaf header is filled in and the rest of the block is covered by a
+ * single dirent with an empty name and a zero inode number.
+ *
+ * Returns: the leaf header inside *pbh, or NULL if no buffer was obtained
+ */
+static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct qstr empty = { .name = "", .len = 0, .hash = 0 };
+	struct buffer_head *bh;
+	struct gfs2_leaf *leaf;
+	u64 blkno;
+
+	blkno = gfs2_alloc_meta(ip);
+	bh = gfs2_meta_new(ip->i_gl, blkno);
+	if (!bh)
+		return NULL;
+
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
+
+	leaf = (struct gfs2_leaf *)bh->b_data;
+	leaf->lf_depth = cpu_to_be16(depth);
+	leaf->lf_entries = 0;
+	leaf->lf_dirent_format = cpu_to_be16(GFS2_FORMAT_DE);
+	leaf->lf_next = 0;
+	memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
+
+	/* One unused dirent spanning everything after the leaf header. */
+	gfs2_qstr2dirent(&empty, bh->b_size - sizeof(struct gfs2_leaf),
+			 (struct gfs2_dirent *)(leaf + 1));
+
+	*pbh = bh;
+	return leaf;
+}
+
+/**
+ * dir_make_exhash - Convert a stuffed directory into an ExHash directory
+ * @inode: The directory inode
+ *
+ * The dirents stored after the dinode header are copied into a newly
+ * allocated leaf block.  The space they occupied in the dinode block is then
+ * cleared and filled with sd_hash_ptrs hash table pointers, all referring to
+ * that one leaf, and the in-core dinode is updated and written to the buffer.
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int dir_make_exhash(struct inode *inode)
+{
+	struct gfs2_inode *dip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_dirent *dent;
+	struct qstr args;
+	struct buffer_head *bh, *dibh;
+	struct gfs2_leaf *leaf;
+	int y;
+	u32 x;
+	u64 *lp, bn;
+	int error;
+
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (error)
+		return error;
+
+	/*  Turn over a new leaf  */
+
+	leaf = new_leaf(inode, &bh, 0);
+	if (!leaf) {
+		/* dibh is already held here; release it before bailing out */
+		brelse(dibh);
+		return -ENOSPC;
+	}
+	bn = bh->b_blocknr;
+
+	gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
+	leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
+
+	/*  Copy dirents  */
+
+	gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
+			      sizeof(struct gfs2_dinode));
+
+	/*  Find last entry  */
+
+	args.len = bh->b_size - sizeof(struct gfs2_dinode) +
+		   sizeof(struct gfs2_leaf);
+	args.name = bh->b_data;
+	dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
+				gfs2_dirent_last, &args, NULL);
+	if (!dent || IS_ERR(dent)) {
+		/* No last entry at all is reported as -EIO */
+		error = dent ? PTR_ERR(dent) : -EIO;
+		brelse(bh);
+		brelse(dibh);
+		return error;
+	}
+
+	/*  Adjust the last dirent's record length
+	   (Remember that dent still points to the last entry.)  */
+
+	dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
+		sizeof(struct gfs2_dinode) -
+		sizeof(struct gfs2_leaf));
+
+	brelse(bh);
+
+	/*  We're done with the new leaf block, now setup the new
+	    hash table.  */
+
+	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
+	lp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
+
+	/* Every hash table slot initially points at the single leaf */
+	for (x = sdp->sd_hash_ptrs; x--; lp++)
+		*lp = cpu_to_be64(bn);
+
+	dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
+	dip->i_di.di_blocks++;
+	dip->i_di.di_flags |= GFS2_DIF_EXHASH;
+	dip->i_di.di_payload_format = 0;
+
+	/* di_depth = index of the highest set bit of sd_hash_ptrs */
+	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
+	dip->i_di.di_depth = y;
+
+	gfs2_dinode_out(&dip->i_di, dibh->b_data);
+
+	brelse(dibh);
+
+	return 0;
+}
+
+/**
+ * dir_split_leaf - Split a leaf block into two
+ * @inode: The directory inode
+ * @name: name whose hash selects the leaf to split
+ *
+ * A new leaf one level deeper than the old one is allocated, the first half
+ * of the hash table pointers that referred to the old leaf are redirected to
+ * it, and every in-use entry whose hash lies below the divider is moved
+ * across.  The old leaf's depth is then raised to match the new leaf.
+ *
+ * NOTE(review): an ERR_PTR from gfs2_dirent_alloc() breaks out of the copy
+ * loop, but "error" is then overwritten by gfs2_meta_inode_buffer(), so that
+ * failure does not appear to reach the caller.  A NULL return from
+ * gfs2_dirent_alloc() is not checked before "new" is dereferenced - confirm
+ * whether either case can occur for a freshly created leaf.
+ *
+ * Returns: 0 on success, 1 if the leaf is already at the directory's depth
+ * and cannot be split, negative error code on failure
+ */
+
+static int dir_split_leaf(struct inode *inode, const struct qstr *name)
+{
+ struct gfs2_inode *dip = GFS2_I(inode);
+ struct buffer_head *nbh, *obh, *dibh;
+ struct gfs2_leaf *nleaf, *oleaf;
+ struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
+ u32 start, len, half_len, divider;
+ u64 bn, *lp, leaf_no;
+ u32 index;
+ int x, moved = 0;
+ int error;
+
+ index = name->hash >> (32 - dip->i_di.di_depth);
+ error = get_leaf_nr(dip, index, &leaf_no);
+ if (error)
+ return error;
+
+ /* Get the old leaf block */
+ error = get_leaf(dip, leaf_no, &obh);
+ if (error)
+ return error;
+
+ oleaf = (struct gfs2_leaf *)obh->b_data;
+ if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
+ brelse(obh);
+ return 1; /* can't split */
+ }
+
+ gfs2_trans_add_bh(dip->i_gl, obh, 1);
+
+ nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
+ if (!nleaf) {
+ brelse(obh);
+ return -ENOSPC;
+ }
+ bn = nbh->b_blocknr;
+
+ /* Compute the start and len of leaf pointers in the hash table. */
+ len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
+ half_len = len >> 1;
+ if (!half_len) {
+ printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
+ gfs2_consist_inode(dip);
+ error = -EIO;
+ goto fail_brelse;
+ }
+
+ start = (index & ~(len - 1)); /* first table slot pointing at the old leaf */
+
+ /* Change the pointers.
+ Don't bother distinguishing stuffed from non-stuffed.
+ This code is complicated enough already. */
+ lp = kmalloc(half_len * sizeof(u64), GFP_NOFS | __GFP_NOFAIL);
+ /* Change the pointers */
+ for (x = 0; x < half_len; x++)
+ lp[x] = cpu_to_be64(bn);
+
+ error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64),
+ half_len * sizeof(u64));
+ if (error != half_len * sizeof(u64)) {
+ if (error >= 0)
+ error = -EIO; /* short write */
+ goto fail_lpfree;
+ }
+
+ kfree(lp);
+
+ /* Compute the divider */
+ divider = (start + half_len) << (32 - dip->i_di.di_depth);
+
+ /* Copy the entries */
+ dirent_first(dip, obh, &dent);
+
+ do {
+ next = dent; /* fetch the successor before dent may be deleted */
+ if (dirent_next(dip, obh, &next))
+ next = NULL;
+
+ if (dent->de_inum.no_addr &&
+ be32_to_cpu(dent->de_hash) < divider) {
+ struct qstr str;
+ str.name = (char*)(dent+1);
+ str.len = be16_to_cpu(dent->de_name_len);
+ str.hash = be32_to_cpu(dent->de_hash);
+ new = gfs2_dirent_alloc(inode, nbh, &str);
+ if (IS_ERR(new)) {
+ error = PTR_ERR(new);
+ break;
+ }
+
+ new->de_inum = dent->de_inum; /* No endian worries */
+ new->de_type = dent->de_type; /* No endian worries */
+ nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
+
+ dirent_del(dip, obh, prev, dent);
+
+ if (!oleaf->lf_entries)
+ gfs2_consist_inode(dip);
+ oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
+
+ if (!prev)
+ prev = dent;
+
+ moved = 1; /* NOTE(review): "moved" is never read afterwards */
+ } else {
+ prev = dent;
+ }
+ dent = next;
+ } while (dent);
+
+ oleaf->lf_depth = nleaf->lf_depth;
+
+ error = gfs2_meta_inode_buffer(dip, &dibh);
+ if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
+ dip->i_di.di_blocks++; /* account for the new leaf block */
+ gfs2_dinode_out(&dip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ brelse(obh);
+ brelse(nbh);
+
+ return error;
+
+fail_lpfree:
+ kfree(lp);
+
+fail_brelse:
+ brelse(obh);
+ brelse(nbh);
+ return error;
+}
+
+/**
+ * dir_double_exhash - Double size of ExHash table
+ * @dip: The GFS2 dinode
+ *
+ * The hash table is processed one sd_hash_bsize chunk at a time, from the
+ * last chunk down to the first.  Each chunk is read into the start of a
+ * scratch buffer, every pointer is emitted twice in a row into the rest of
+ * the buffer, and sb_bsize bytes of doubled pointers are written back at
+ * block * sb_bsize.  On success di_depth is incremented and the dinode is
+ * written to its buffer.
+ *
+ * NOTE(review): the write size and offset presumably rely on
+ * sd_hash_bsize == sb_bsize / 2 - confirm where sd_hash_bsize is set up.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_double_exhash(struct gfs2_inode *dip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct buffer_head *dibh;
+ u32 hsize;
+ u64 *buf;
+ u64 *from, *to;
+ u64 block;
+ int x;
+ int error = 0;
+
+ hsize = 1 << dip->i_di.di_depth;
+ if (hsize * sizeof(u64) != dip->i_di.di_size) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+
+ /* Allocate both the "from" and "to" buffers in one big chunk */
+
+ buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
+
+ for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
+ error = gfs2_dir_read_data(dip, (char *)buf,
+ block * sdp->sd_hash_bsize,
+ sdp->sd_hash_bsize, 1);
+ if (error != sdp->sd_hash_bsize) {
+ if (error >= 0)
+ error = -EIO; /* short read */
+ goto fail;
+ }
+
+ from = buf;
+ to = (u64 *)((char *)buf + sdp->sd_hash_bsize);
+
+ for (x = sdp->sd_hash_ptrs; x--; from++) {
+ *to++ = *from; /* No endianess worries */
+ *to++ = *from;
+ }
+
+ error = gfs2_dir_write_data(dip,
+ (char *)buf + sdp->sd_hash_bsize,
+ block * sdp->sd_sb.sb_bsize,
+ sdp->sd_sb.sb_bsize);
+ if (error != sdp->sd_sb.sb_bsize) {
+ if (error >= 0)
+ error = -EIO; /* short write */
+ goto fail;
+ }
+ }
+
+ kfree(buf);
+
+ error = gfs2_meta_inode_buffer(dip, &dibh);
+ if (!gfs2_assert_withdraw(sdp, !error)) {
+ dip->i_di.di_depth++;
+ gfs2_dinode_out(&dip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ return error;
+
+fail:
+ kfree(buf);
+ return error;
+}
+
+/**
+ * compare_dents - order directory entries for sorting
+ * @a: pointer to the first dirent pointer
+ * @b: pointer to the second dirent pointer
+ *
+ * Entries are ordered by hash value first.  Equal hashes are ordered by
+ * name length, and equal lengths by memcmp() of the name bytes that follow
+ * each dirent header.
+ *
+ * Returns: 1 or -1 when hash or length differ, otherwise the memcmp() result
+ */
+
+static int compare_dents(const void *a, const void *b)
+{
+	const struct gfs2_dirent *lhs = *(const struct gfs2_dirent **)a;
+	const struct gfs2_dirent *rhs = *(const struct gfs2_dirent **)b;
+	u32 lhs_hash = be32_to_cpu(lhs->de_hash);
+	u32 rhs_hash = be32_to_cpu(rhs->de_hash);
+	unsigned int lhs_len, rhs_len;
+
+	if (lhs_hash != rhs_hash)
+		return (lhs_hash > rhs_hash) ? 1 : -1;
+
+	lhs_len = be16_to_cpu(lhs->de_name_len);
+	rhs_len = be16_to_cpu(rhs->de_name_len);
+	if (lhs_len != rhs_len)
+		return (lhs_len > rhs_len) ? 1 : -1;
+
+	return memcmp(lhs + 1, rhs + 1, lhs_len);
+}
+
+/**
+ * do_filldir_main - read out directory entries
+ * @dip: The GFS2 inode
+ * @offset: The offset in the file to read from
+ * @opaque: opaque data to pass to filldir
+ * @filldir: The function to pass entries to
+ * @darr: an array of struct gfs2_dirent pointers to read
+ * @entries: the number of entries in darr
+ * @copied: pointer to int that's non-zero if an entry has been copied out
+ *
+ * Jump through some hoops to make sure that if there are hash collisions,
+ * they are read out at the beginning of a buffer. We want to minimize
+ * the possibility that they will fall into different readdir buffers or
+ * that someone will want to seek to that location.
+ *
+ * @darr is sorted in place with compare_dents().  Entries whose offset is
+ * below *@offset are skipped; *@offset is advanced to each entry handed to
+ * @filldir.  darr[0] is read unconditionally, so @entries must be non-zero.
+ *
+ * Returns: 0 when all entries were processed, 1 when filldir reported an
+ * error or a run of colliding hashes was deferred to the next call
+ */
+
+static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
+ void *opaque, gfs2_filldir_t filldir,
+ const struct gfs2_dirent **darr, u32 entries,
+ int *copied)
+{
+ const struct gfs2_dirent *dent, *dent_next;
+ struct gfs2_inum inum;
+ u64 off, off_next;
+ unsigned int x, y;
+ int run = 0; /* non-zero while inside a sequence of equal offsets */
+ int error = 0;
+
+ sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+
+ dent_next = darr[0];
+ off_next = be32_to_cpu(dent_next->de_hash);
+ off_next = gfs2_disk_hash2offset(off_next);
+
+ for (x = 0, y = 1; x < entries; x++, y++) { /* y indexes the look-ahead entry */
+ dent = dent_next;
+ off = off_next;
+
+ if (y < entries) {
+ dent_next = darr[y];
+ off_next = be32_to_cpu(dent_next->de_hash);
+ off_next = gfs2_disk_hash2offset(off_next);
+
+ if (off < *offset)
+ continue;
+ *offset = off;
+
+ if (off_next == off) {
+ if (*copied && !run)
+ return 1; /* start a collision run in a fresh buffer */
+ run = 1;
+ } else
+ run = 0;
+ } else {
+ if (off < *offset)
+ continue;
+ *offset = off;
+ }
+
+ gfs2_inum_in(&inum, (char *)&dent->de_inum);
+
+ error = filldir(opaque, (const char *)(dent + 1),
+ be16_to_cpu(dent->de_name_len),
+ off, &inum,
+ be16_to_cpu(dent->de_type));
+ if (error)
+ return 1;
+
+ *copied = 1;
+ }
+
+ /* Increment the *offset by one, so the next time we come into the
+ do_filldir fxn, we get the next entry instead of the last one in the
+ current leaf */
+
+ (*offset)++;
+
+ return 0;
+}
+
+/*
+ * gfs2_dir_read_leaf - feed the entries of one leaf chain to filldir
+ * @inode: the directory
+ * @offset: readdir position, passed through to do_filldir_main()
+ * @opaque: opaque data for @filldir
+ * @filldir: callback receiving each entry
+ * @copied: passed through to do_filldir_main()
+ * @depth: set to the lf_depth of the first leaf in the chain
+ * @leaf_no: block number of the first leaf in the chain
+ *
+ * The chain is walked twice: once to count leaves and entries (from each
+ * leaf's lf_entries) so a single array can be sized, and once to gather the
+ * dirent pointers.  Buffers of non-empty leaves stay held in larr[] until
+ * do_filldir_main() has finished, because the gathered pointers refer into
+ * them.
+ *
+ * Returns: negative errno on failure, 0 if the chain has no entries,
+ * otherwise the do_filldir_main() result
+ */
+static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
+			      gfs2_filldir_t filldir, int *copied,
+			      unsigned *depth, u64 leaf_no)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *bh;
+	struct gfs2_leaf *lf;
+	unsigned entries = 0;
+	unsigned leaves = 0;
+	const struct gfs2_dirent **darr, *dent;
+	struct dirent_gather g;
+	struct buffer_head **larr;
+	int leaf = 0;
+	int error, i;
+	u64 lfn = leaf_no;
+
+	/* Pass 1: count leaves and entries along the chain */
+	do {
+		error = get_leaf(ip, lfn, &bh);
+		if (error)
+			goto out;
+		lf = (struct gfs2_leaf *)bh->b_data;
+		if (leaves == 0)
+			*depth = be16_to_cpu(lf->lf_depth);
+		entries += be16_to_cpu(lf->lf_entries);
+		leaves++;
+		lfn = be64_to_cpu(lf->lf_next);
+		brelse(bh);
+	} while(lfn);
+
+	if (!entries)
+		return 0;
+
+	/* One allocation: leaf buffer pointers first, dirent pointers after */
+	error = -ENOMEM;
+	larr = vmalloc((leaves + entries) * sizeof(void *));
+	if (!larr)
+		goto out;
+	darr = (const struct gfs2_dirent **)(larr + leaves);
+	g.pdent = darr;
+	g.offset = 0;
+	lfn = leaf_no;
+
+	/* Pass 2: gather dirent pointers, keeping non-empty leaves held */
+	do {
+		error = get_leaf(ip, lfn, &bh);
+		if (error)
+			goto out_vfree;
+		lf = (struct gfs2_leaf *)bh->b_data;
+		lfn = be64_to_cpu(lf->lf_next);
+		if (lf->lf_entries) {
+			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+						gfs2_dirent_gather, NULL, &g);
+			if (IS_ERR(dent)) {
+				error = PTR_ERR(dent);
+				/* bh is not in larr[] yet, so release it here */
+				brelse(bh);
+				goto out_vfree;
+			}
+			error = 0;
+			larr[leaf++] = bh;
+		} else {
+			brelse(bh);
+		}
+	} while(lfn);
+
+	error = do_filldir_main(ip, offset, opaque, filldir, darr,
+				entries, copied);
+out_vfree:
+	for(i = 0; i < leaf; i++)
+		brelse(larr[i]);
+	vfree(larr);
+out:
+	return error;
+}
+
+/**
+ * dir_e_read - Reads the entries from a directory into a filldir buffer
+ * @inode: the directory inode
+ * @offset: the hash of the last entry read shifted to the right once
+ * @opaque: buffer for the filldir function to fill
+ * @filldir: points to the filldir function to use
+ *
+ * Starting at the hash table index derived from *@offset, the table is read
+ * one sd_hash_bsize chunk at a time and each leaf chain is handed to
+ * gfs2_dir_read_leaf().  The leaf depth it reports is used to step over all
+ * table slots that point at the same leaf.  A positive result from
+ * gfs2_dir_read_leaf() stops the walk and is reported as 0.
+ *
+ * Returns: errno
+ */
+
+static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
+ gfs2_filldir_t filldir)
+{
+ struct gfs2_inode *dip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ u32 hsize, len = 0;
+ u32 ht_offset, lp_offset, ht_offset_cur = -1; /* -1: no chunk loaded yet */
+ u32 hash, index;
+ u64 *lp;
+ int copied = 0;
+ int error = 0;
+ unsigned depth = 0;
+
+ hsize = 1 << dip->i_di.di_depth;
+ if (hsize * sizeof(u64) != dip->i_di.di_size) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+
+ hash = gfs2_dir_offset2hash(*offset);
+ index = hash >> (32 - dip->i_di.di_depth);
+
+ lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+ if (!lp)
+ return -ENOMEM;
+
+ while (index < hsize) {
+ lp_offset = index & (sdp->sd_hash_ptrs - 1);
+ ht_offset = index - lp_offset;
+
+ if (ht_offset_cur != ht_offset) { /* need a different table chunk */
+ error = gfs2_dir_read_data(dip, (char *)lp,
+ ht_offset * sizeof(u64),
+ sdp->sd_hash_bsize, 1);
+ if (error != sdp->sd_hash_bsize) {
+ if (error >= 0)
+ error = -EIO;
+ goto out;
+ }
+ ht_offset_cur = ht_offset;
+ }
+
+ error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
+ &copied, &depth,
+ be64_to_cpu(lp[lp_offset]));
+ if (error)
+ break;
+
+ len = 1 << (dip->i_di.di_depth - depth);
+ index = (index & ~(len - 1)) + len; /* first slot of the next leaf */
+ }
+
+out:
+ kfree(lp);
+ if (error > 0)
+ error = 0;
+ return error;
+}
+
+/*
+ * gfs2_dir_read - pass directory entries to a filldir callback
+ * @inode: the directory
+ * @offset: readdir position, updated as entries are passed on
+ * @opaque: opaque data for @filldir
+ * @filldir: callback receiving each entry
+ *
+ * Directories with GFS2_DIF_EXHASH set are handled by dir_e_read().
+ * Otherwise the directory must be stuffed, and the dirents in the dinode
+ * block are gathered and handed to do_filldir_main().
+ *
+ * Returns: 0 on success (including an early stop by filldir), or errno
+ */
+int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
+		  gfs2_filldir_t filldir)
+{
+	struct gfs2_inode *dip = GFS2_I(inode);
+	const struct gfs2_dirent **darr, *dent;
+	struct dirent_gather gather;
+	struct buffer_head *dibh;
+	int copied = 0;
+	int error;
+
+	if (!dip->i_di.di_entries)
+		return 0;
+
+	if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+		return dir_e_read(inode, offset, opaque, filldir);
+
+	if (!gfs2_is_stuffed(dip)) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (error)
+		return error;
+
+	darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
+		       GFP_KERNEL);
+	if (!darr) {
+		error = -ENOMEM;
+	} else {
+		gather.pdent = darr;
+		gather.offset = 0;
+		dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
+					gfs2_dirent_gather, NULL, &gather);
+		if (IS_ERR(dent))
+			error = PTR_ERR(dent);
+		else
+			error = do_filldir_main(dip, offset, opaque, filldir,
+						darr, dip->i_di.di_entries,
+						&copied);
+		kfree(darr);
+	}
+
+	/* A positive value only means filldir stopped early. */
+	if (error > 0)
+		error = 0;
+
+	brelse(dibh);
+
+	return error;
+}
+
+/**
+ * gfs2_dir_search - Search a directory
+ * @dir: The directory to search
+ * @name: The name to look for
+ * @inum: if non-NULL, receives the inode number of the entry
+ * @type: if non-NULL, receives the type of the entry
+ *
+ * This routine searches a directory for a file or another directory.
+ * Assumes a glock is held on dir.
+ *
+ * Returns: 0 if found, -ENOENT if not, other errno on failure
+ */
+
+int gfs2_dir_search(struct inode *dir, const struct qstr *name,
+		    struct gfs2_inum *inum, unsigned int *type)
+{
+	struct buffer_head *bh;
+	struct gfs2_dirent *dent;
+
+	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
+	if (!dent)
+		return -ENOENT;
+	if (IS_ERR(dent))
+		return PTR_ERR(dent);
+
+	if (inum)
+		gfs2_inum_in(inum, (char *)&dent->de_inum);
+	if (type)
+		*type = be16_to_cpu(dent->de_type);
+
+	brelse(bh);
+	return 0;
+}
+
+/*
+ * dir_new_leaf - append a new leaf to the end of a leaf chain
+ * @inode: the directory
+ * @name: name whose hash selects the chain
+ *
+ * The chain is followed through lf_next to its last leaf, a new leaf of the
+ * same depth is linked after it, and the dinode's block count is bumped.
+ *
+ * Returns: 0 on success, -ENOSPC if no leaf could be created, other errno
+ */
+static int dir_new_leaf(struct inode *inode, const struct qstr *name)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *tail_bh, *bh;
+	struct gfs2_leaf *tail;
+	u64 next_blk;
+	int error;
+
+	error = get_first_leaf(ip, name->hash >> (32 - ip->i_di.di_depth),
+			       &tail_bh);
+	if (error)
+		return error;
+
+	/* Walk to the last leaf of the chain. */
+	tail = (struct gfs2_leaf *)tail_bh->b_data;
+	while ((next_blk = be64_to_cpu(tail->lf_next)) != 0) {
+		brelse(tail_bh);
+		error = get_leaf(ip, next_blk, &tail_bh);
+		if (error)
+			return error;
+		tail = (struct gfs2_leaf *)tail_bh->b_data;
+	}
+
+	gfs2_trans_add_bh(ip->i_gl, tail_bh, 1);
+
+	if (!new_leaf(inode, &bh, be16_to_cpu(tail->lf_depth))) {
+		brelse(tail_bh);
+		return -ENOSPC;
+	}
+	tail->lf_next = cpu_to_be64(bh->b_blocknr);
+	brelse(bh);
+	brelse(tail_bh);
+
+	/* Record the extra block in the dinode. */
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		return error;
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	ip->i_di.di_blocks++;
+	gfs2_dinode_out(&ip->i_di, bh->b_data);
+	brelse(bh);
+	return 0;
+}
+
+/**
+ * gfs2_dir_add - Add new filename into directory
+ * @inode: The directory inode
+ * @name: The new name
+ * @inum: The inode number of the entry
+ * @type: The type of the entry
+ *
+ * Loops until space for the entry is found.  When a search finds no space,
+ * the directory is first converted to ExHash if it is not already; otherwise
+ * the target leaf is split, the hash table is doubled and the split retried
+ * (while di_depth is below GFS2_DIR_MAX_DEPTH), and as a last resort a new
+ * leaf is chained on with dir_new_leaf().
+ *
+ * NOTE(review): any failure of dir_new_leaf() is reported as -ENOSPC, which
+ * hides its actual error code - confirm this is intended.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+int gfs2_dir_add(struct inode *inode, const struct qstr *name,
+ const struct gfs2_inum *inum, unsigned type)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct buffer_head *bh;
+ struct gfs2_dirent *dent;
+ struct gfs2_leaf *leaf;
+ int error;
+
+ while(1) {
+ dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
+ &bh);
+ if (dent) {
+ if (IS_ERR(dent))
+ return PTR_ERR(dent);
+ dent = gfs2_init_dirent(inode, dent, name, bh);
+ gfs2_inum_out(inum, (char *)&dent->de_inum);
+ dent->de_type = cpu_to_be16(type);
+ if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+ leaf = (struct gfs2_leaf *)bh->b_data;
+ leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
+ }
+ brelse(bh);
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ break;
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ ip->i_di.di_entries++;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+ gfs2_dinode_out(&ip->i_di, bh->b_data);
+ brelse(bh);
+ error = 0;
+ break;
+ }
+ if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+ error = dir_make_exhash(inode);
+ if (error)
+ break;
+ continue;
+ }
+ error = dir_split_leaf(inode, name);
+ if (error == 0)
+ continue;
+ if (error < 0)
+ break;
+ if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) { /* error > 0: leaf could not be split */
+ error = dir_double_exhash(ip);
+ if (error)
+ break;
+ error = dir_split_leaf(inode, name);
+ if (error < 0)
+ break;
+ if (error == 0)
+ continue;
+ }
+ error = dir_new_leaf(inode, name);
+ if (!error)
+ continue;
+ error = -ENOSPC;
+ break;
+ }
+ return error;
+}
+
+
+/**
+ * gfs2_dir_del - Delete a directory entry
+ * @dip: The GFS2 inode
+ * @name: The filename
+ *
+ * The entry is removed with dirent_del(), the leaf's lf_entries is
+ * decremented for ExHash directories, and the dinode's entry count and
+ * mtime/ctime are updated.  A missing entry is treated as an inconsistency
+ * and reported as -EIO.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
+{
+ struct gfs2_dirent *dent, *prev = NULL;
+ struct buffer_head *bh;
+ int error;
+
+ /* Returns _either_ the entry (if it's first in block) or the
+ previous entry otherwise */
+ dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
+ if (!dent) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+ if (IS_ERR(dent)) {
+ gfs2_consist_inode(dip);
+ return PTR_ERR(dent);
+ }
+ /* If not first in block, adjust pointers accordingly */
+ if (gfs2_dirent_find(dent, name, NULL) == 0) { /* presumably 0 means dent is not the match - confirm */
+ prev = dent;
+ dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
+ }
+
+ dirent_del(dip, bh, prev, dent);
+ if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+ struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
+ u16 entries = be16_to_cpu(leaf->lf_entries);
+ if (!entries)
+ gfs2_consist_inode(dip);
+ leaf->lf_entries = cpu_to_be16(--entries);
+ }
+ brelse(bh);
+
+ error = gfs2_meta_inode_buffer(dip, &bh);
+ if (error)
+ return error;
+
+ if (!dip->i_di.di_entries)
+ gfs2_consist_inode(dip);
+ gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ dip->i_di.di_entries--;
+ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+ gfs2_dinode_out(&dip->i_di, bh->b_data);
+ brelse(bh);
+ mark_inode_dirty(&dip->i_inode);
+
+ return error;
+}
+
+/**
+ * gfs2_dir_mvino - Change inode number of directory entry
+ * @dip: The GFS2 inode
+ * @filename: name of the entry to change
+ * @inum: the new inode number to store in the entry
+ * @new_type: the new type to store in the entry
+ *
+ * This routine changes the inode number of a directory entry. It's used
+ * by rename to change ".." when a directory is moved.
+ * Assumes a glock is held on dip.
+ *
+ * A missing entry is treated as an inconsistency and reported as -EIO.
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+ struct gfs2_inum *inum, unsigned int new_type)
+{
+ struct buffer_head *bh;
+ struct gfs2_dirent *dent;
+ int error;
+
+ dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
+ if (!dent) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+ if (IS_ERR(dent))
+ return PTR_ERR(dent);
+
+ gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ gfs2_inum_out(inum, (char *)&dent->de_inum);
+ dent->de_type = cpu_to_be16(new_type);
+
+ if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { /* entry was in a leaf: switch bh to the dinode block */
+ brelse(bh);
+ error = gfs2_meta_inode_buffer(dip, &bh);
+ if (error)
+ return error;
+ gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ }
+
+ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+ gfs2_dinode_out(&dip->i_di, bh->b_data);
+ brelse(bh);
+ return 0;
+}
+
+/**
+ * foreach_leaf - call a function for each leaf in a directory
+ * @dip: the directory
+ * @lc: the function to call for each leaf
+ * @data: private data to pass to it
+ *
+ * The hash table is read one sd_hash_bsize chunk at a time.  For every
+ * non-zero pointer the leaf is read to learn its depth, @lc is called with
+ * the current index, the number of table slots computed from that depth and
+ * the leaf block number, and the index then skips past those slots.  Zero
+ * pointers are stepped over one at a time.  The walk stops at the first
+ * error from @lc.
+ *
+ * Returns: errno
+ */
+
+static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct buffer_head *bh;
+ struct gfs2_leaf *leaf;
+ u32 hsize, len;
+ u32 ht_offset, lp_offset, ht_offset_cur = -1; /* -1: no chunk loaded yet */
+ u32 index = 0;
+ u64 *lp;
+ u64 leaf_no;
+ int error = 0;
+
+ hsize = 1 << dip->i_di.di_depth;
+ if (hsize * sizeof(u64) != dip->i_di.di_size) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+
+ lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+ if (!lp)
+ return -ENOMEM;
+
+ while (index < hsize) {
+ lp_offset = index & (sdp->sd_hash_ptrs - 1);
+ ht_offset = index - lp_offset;
+
+ if (ht_offset_cur != ht_offset) { /* need a different table chunk */
+ error = gfs2_dir_read_data(dip, (char *)lp,
+ ht_offset * sizeof(u64),
+ sdp->sd_hash_bsize, 1);
+ if (error != sdp->sd_hash_bsize) {
+ if (error >= 0)
+ error = -EIO;
+ goto out;
+ }
+ ht_offset_cur = ht_offset;
+ }
+
+ leaf_no = be64_to_cpu(lp[lp_offset]);
+ if (leaf_no) {
+ error = get_leaf(dip, leaf_no, &bh);
+ if (error)
+ goto out;
+ leaf = (struct gfs2_leaf *)bh->b_data;
+ len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
+ brelse(bh);
+
+ error = lc(dip, index, len, leaf_no, data);
+ if (error)
+ goto out;
+
+ index = (index & ~(len - 1)) + len;
+ } else
+ index++;
+ }
+
+ if (index != hsize) { /* stepping by leaf sizes overshot the table end */
+ gfs2_consist_inode(dip);
+ error = -EIO;
+ }
+
+out:
+ kfree(lp);
+
+ return error;
+}
+
+/**
+ * leaf_dealloc - Deallocate a directory leaf
+ * @dip: the directory
+ * @index: the hash table offset in the directory
+ * @len: the number of pointers to this leaf
+ * @leaf_no: the leaf number
+ * @data: not used
+ *
+ * The chain starting at @leaf_no is walked once to collect the resource
+ * groups involved and count the leaves, then again inside a transaction to
+ * free each leaf block.  Finally the @len hash table pointers starting at
+ * @index are overwritten with zeroes and the dinode is written out.
+ * Resources are released in reverse order of acquisition via the out_*
+ * labels.
+ *
+ * Returns: errno
+ */
+
+static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
+ u64 leaf_no, void *data)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct gfs2_leaf *tmp_leaf;
+ struct gfs2_rgrp_list rlist;
+ struct buffer_head *bh, *dibh;
+ u64 blk, nblk;
+ unsigned int rg_blocks = 0, l_blocks = 0;
+ char *ht;
+ unsigned int x, size = len * sizeof(u64);
+ int error;
+
+ memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+ ht = kzalloc(size, GFP_KERNEL); /* zeroed replacement for the table slots */
+ if (!ht)
+ return -ENOMEM;
+
+ gfs2_alloc_get(dip);
+
+ error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out;
+
+ error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
+ if (error)
+ goto out_qs;
+
+ /* Count the number of leaves */
+
+ for (blk = leaf_no; blk; blk = nblk) {
+ error = get_leaf(dip, blk, &bh);
+ if (error)
+ goto out_rlist;
+ tmp_leaf = (struct gfs2_leaf *)bh->b_data;
+ nblk = be64_to_cpu(tmp_leaf->lf_next);
+ brelse(bh);
+
+ gfs2_rlist_add(sdp, &rlist, blk);
+ l_blocks++;
+ }
+
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+ for (x = 0; x < rlist.rl_rgrps; x++) {
+ struct gfs2_rgrpd *rgd;
+ rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+ rg_blocks += rgd->rd_ri.ri_length;
+ }
+
+ error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+ if (error)
+ goto out_rlist;
+
+ error = gfs2_trans_begin(sdp,
+ rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
+ RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
+ if (error)
+ goto out_rg_gunlock;
+
+ for (blk = leaf_no; blk; blk = nblk) {
+ error = get_leaf(dip, blk, &bh);
+ if (error)
+ goto out_end_trans;
+ tmp_leaf = (struct gfs2_leaf *)bh->b_data;
+ nblk = be64_to_cpu(tmp_leaf->lf_next); /* read the link before the block is freed */
+ brelse(bh);
+
+ gfs2_free_meta(dip, blk, 1);
+
+ if (!dip->i_di.di_blocks)
+ gfs2_consist_inode(dip);
+ dip->i_di.di_blocks--;
+ }
+
+ error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
+ if (error != size) {
+ if (error >= 0)
+ error = -EIO; /* short write */
+ goto out_end_trans;
+ }
+
+ error = gfs2_meta_inode_buffer(dip, &dibh);
+ if (error)
+ goto out_end_trans;
+
+ gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+ gfs2_dinode_out(&dip->i_di, dibh->b_data);
+ brelse(dibh);
+
+out_end_trans:
+ gfs2_trans_end(sdp);
+out_rg_gunlock:
+ gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist:
+ gfs2_rlist_free(&rlist);
+ gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
+out_qs:
+ gfs2_quota_unhold(dip);
+out:
+ gfs2_alloc_put(dip);
+ kfree(ht);
+ return error;
+}
+
+/**
+ * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
+ * @dip: the directory
+ *
+ * Dealloc all on-disk directory leaves to FREEMETA state
+ * Change on-disk inode type to "regular file"
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_dinode *di;
+	struct buffer_head *bh;
+	int error;
+
+	/* Dealloc on-disk leaves to FREEMETA state */
+	error = foreach_leaf(dip, leaf_dealloc, NULL);
+	if (error)
+		return error;
+
+	/* Make this a regular file in case we crash.
+	   (We don't want to free these blocks a second time.)  */
+
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(dip, &bh);
+	if (error)
+		goto out_end_trans;
+
+	gfs2_trans_add_bh(dip->i_gl, bh, 1);
+	di = (struct gfs2_dinode *)bh->b_data;
+	di->di_mode = cpu_to_be32(S_IFREG);
+	brelse(bh);
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+
+	return error;
+}
+
+/**
+ * gfs2_diradd_alloc_required - find if adding entry will require an allocation
+ * @inode: the directory being added to
+ * @name: the filename that's going to be added
+ *
+ * Returns: 1 if alloc required, 0 if not, -ve on error
+ */
+
+int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
+{
+	struct buffer_head *bh;
+	struct gfs2_dirent *space;
+
+	space = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
+	if (IS_ERR(space))
+		return PTR_ERR(space);
+	if (!space)
+		return 1;
+
+	/* Only the answer is wanted, not the slot itself. */
+	brelse(bh);
+	return 0;
+}
+
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..371233419b07
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+
+#include <linux/dcache.h>
+
+struct inode;
+struct gfs2_inode;
+struct gfs2_inum;
+
+/**
+ * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
+ * @opaque: opaque data used by the function
+ * @name: the name of the directory entry
+ * @length: the length of the name
+ * @offset: the entry's offset in the directory
+ * @inum: the inode number the entry points to
+ * @type: the type of inode the entry points to
+ *
+ * Any non-zero return value stops the directory read; the value itself is
+ * not passed back to the caller of gfs2_dir_read().
+ *
+ * Returns: 0 on success, 1 if buffer full
+ */
+
+typedef int (*gfs2_filldir_t) (void *opaque,
+ const char *name, unsigned int length,
+ u64 offset,
+ struct gfs2_inum *inum, unsigned int type);
+
+int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
+ struct gfs2_inum *inum, unsigned int *type);
+int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+ const struct gfs2_inum *inum, unsigned int type);
+int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque,
+ gfs2_filldir_t filldir);
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+ struct gfs2_inum *new_inum, unsigned int new_type);
+
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+
+int gfs2_diradd_alloc_required(struct inode *dir,
+ const struct qstr *filename); /* 1 if alloc required, 0 if not, -ve on error */
+int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+ struct buffer_head **bhp); /* NOTE(review): confirm a definition exists in dir.c */
+
+/* Hash @len bytes of @data: crc32_le() seeded with ~0, result inverted. */
+static inline u32 gfs2_disk_hash(const char *data, int len)
+{
+	const u32 all_ones = (u32)~0;
+
+	return crc32_le(all_ones, data, len) ^ all_ones;
+}
+
+
+/*
+ * Fill @name from the NUL-terminated string @fname.  Only the pointer is
+ * stored, not a copy; the hash is computed with gfs2_disk_hash().
+ */
+static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
+{
+	name->name = fname;
+	name->len = strlen(fname);
+	name->hash = gfs2_disk_hash(fname, name->len);
+}
+
+/*
+ * Initialise the on-disk dirent @dent for @name with record length @reclen.
+ * The inode number and type are zeroed, and the name bytes are copied to
+ * directly after the dirent header.
+ *
+ * N.B. This probably ought to take inum & type as args as well
+ */
+static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
+{
+	char *name_area = (char *)(dent + 1);
+
+	/* No inode attached yet */
+	dent->de_inum.no_addr = cpu_to_be64(0);
+	dent->de_inum.no_formal_ino = cpu_to_be64(0);
+	dent->de_type = cpu_to_be16(0);
+
+	dent->de_hash = cpu_to_be32(name->hash);
+	dent->de_rec_len = cpu_to_be16(reclen);
+	dent->de_name_len = cpu_to_be16(name->len);
+	memset(dent->__pad, 0, sizeof(dent->__pad));
+
+	memcpy(name_area, name->name, name->len);
+}
+
+#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..92c54e9b0dc3
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "util.h"
+
+/**
+ * gfs2_ea_name2type - get the type of the ea, and strip the type prefix
+ * @name: full ea name, starting with its type prefix (e.g. "user.foo")
+ * @truncated_name: optional; if non-NULL it is set to the part of @name that
+ *                  follows the "system.", "user." or "security." prefix, or
+ *                  to NULL when none of those prefixes matches
+ *
+ * Returns: GFS2_EATYPE_XXX (GFS2_EATYPE_UNUSED when no prefix matches)
+ */
+
+unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
+{
+	/*
+	 * Each length is derived from the prefix text itself (sizeof counts
+	 * the trailing NUL, hence the "- 1") so the compare length and the
+	 * skip length cannot drift apart.
+	 */
+	static const struct {
+		const char *prefix;
+		unsigned int len;
+		unsigned int type;
+	} prefixes[] = {
+		{ "system.", sizeof("system.") - 1, GFS2_EATYPE_SYS },
+		{ "user.", sizeof("user.") - 1, GFS2_EATYPE_USR },
+		{ "security.", sizeof("security.") - 1, GFS2_EATYPE_SECURITY },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++) {
+		if (strncmp(name, prefixes[i].prefix, prefixes[i].len) != 0)
+			continue;
+		if (truncated_name)
+			*truncated_name = name + prefixes[i].len;
+		return prefixes[i].type;
+	}
+
+	if (truncated_name)
+		*truncated_name = NULL;
+	return GFS2_EATYPE_UNUSED;
+}
+
+/* Read a "user" attribute; permission() must grant MAY_READ on the inode. */
+static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	int denied = permission(&ip->i_inode, MAY_READ, NULL);
+
+	return denied ? denied : gfs2_ea_get_i(ip, er);
+}
+
+/*
+ * Set a "user" attribute.  Only regular files and directories without the
+ * sticky bit qualify (anything else is -EPERM), and permission() must grant
+ * MAY_WRITE on the inode.
+ */
+static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct inode *inode = &ip->i_inode;
+	int plain_dir = S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX);
+	int error;
+
+	if (!S_ISREG(inode->i_mode) && !plain_dir)
+		return -EPERM;
+
+	error = permission(inode, MAY_WRITE, NULL);
+	if (error)
+		return error;
+
+	return gfs2_ea_set_i(ip, er);
+}
+
+/*
+ * Remove a "user" attribute.  Only regular files and directories without
+ * the sticky bit qualify (anything else is -EPERM), and permission() must
+ * grant MAY_WRITE on the inode.
+ */
+static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct inode *inode = &ip->i_inode;
+	int plain_dir = S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX);
+	int error;
+
+	if (!S_ISREG(inode->i_mode) && !plain_dir)
+		return -EPERM;
+
+	error = permission(inode, MAY_WRITE, NULL);
+	if (error)
+		return error;
+
+	return gfs2_ea_remove_i(ip, er);
+}
+
+/*
+ * Read a "system" attribute.  Names other than the two ACL names need
+ * CAP_SYS_ADMIN (-EPERM otherwise); the ACL names are refused with
+ * -EOPNOTSUPP when the ar_posix_acl mount argument is 0.
+ */
+static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	int is_acl = GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
+		     GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len);
+
+	if (!is_acl && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (is_acl && GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0)
+		return -EOPNOTSUPP;
+
+	return gfs2_ea_get_i(ip, er);
+}
+
+/*
+ * Set a "system" attribute.  Only the access and default ACL names are
+ * accepted; any other name fails with -EPERM.  The request is first passed
+ * to gfs2_acl_validate_set(), which may set "remove" to ask for the
+ * attribute to be removed.
+ */
+static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ int remove = 0;
+ int error;
+
+ if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
+ if (!(er->er_flags & GFS2_ERF_MODE)) { /* no mode supplied: start from the inode's */
+ er->er_mode = ip->i_di.di_mode;
+ er->er_flags |= GFS2_ERF_MODE;
+ }
+ error = gfs2_acl_validate_set(ip, 1, er,
+ &remove, &er->er_mode);
+ if (error)
+ return error;
+ error = gfs2_ea_set_i(ip, er); /* always written first, even when "remove" is set */
+ if (error)
+ return error;
+ if (remove)
+ gfs2_ea_remove_i(ip, er); /* NOTE(review): result ignored - presumably best effort, confirm */
+ return 0;
+
+ } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
+ error = gfs2_acl_validate_set(ip, 0, er,
+ &remove, NULL);
+ if (error)
+ return error;
+ if (!remove)
+ error = gfs2_ea_set_i(ip, er);
+ else {
+ error = gfs2_ea_remove_i(ip, er);
+ if (error == -ENODATA) /* nothing to remove is not an error */
+ error = 0;
+ }
+ return error;
+ }
+
+ return -EPERM;
+}
+
+/*
+ * Remove a "system" attribute.  Only the access and default ACL names are
+ * accepted (-EPERM otherwise), and gfs2_acl_validate_remove() must agree.
+ */
+static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	int access = GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ? 1 : 0;
+	int error;
+
+	if (!access && !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))
+		return -EPERM;
+
+	error = gfs2_acl_validate_remove(ip, access);
+	if (error)
+		return error;
+
+	return gfs2_ea_remove_i(ip, er);
+}
+
+/* Read a "security" attribute; permission() must grant MAY_READ. */
+static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	int denied = permission(&ip->i_inode, MAY_READ, NULL);
+
+	return denied ? denied : gfs2_ea_get_i(ip, er);
+}
+
+/* Set a "security" attribute; permission() must grant MAY_WRITE. */
+static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	int denied = permission(&ip->i_inode, MAY_WRITE, NULL);
+
+	return denied ? denied : gfs2_ea_set_i(ip, er);
+}
+
+/* Remove a "security" attribute; permission() must grant MAY_WRITE. */
+static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	int denied = permission(&ip->i_inode, MAY_WRITE, NULL);
+
+	return denied ? denied : gfs2_ea_remove_i(ip, er);
+}
+
+static struct gfs2_eattr_operations gfs2_user_eaops = { /* handlers for "user" attributes */
+ .eo_get = user_eo_get,
+ .eo_set = user_eo_set,
+ .eo_remove = user_eo_remove,
+ .eo_name = "user",
+};
+
+struct gfs2_eattr_operations gfs2_system_eaops = { /* handlers for "system" attributes; not static, declared extern in eaops.h */
+ .eo_get = system_eo_get,
+ .eo_set = system_eo_set,
+ .eo_remove = system_eo_remove,
+ .eo_name = "system",
+};
+
+static struct gfs2_eattr_operations gfs2_security_eaops = { /* handlers for "security" attributes */
+ .eo_get = security_eo_get,
+ .eo_set = security_eo_set,
+ .eo_remove = security_eo_remove,
+ .eo_name = "security",
+};
+
+struct gfs2_eattr_operations *gfs2_ea_ops[] = { /* indexed by er_type in gfs2_ea_get/set/remove */
+ NULL, /* presumably the GFS2_EATYPE_UNUSED slot; dispatching on it would dereference NULL - TODO confirm callers reject it */
+ &gfs2_user_eaops,
+ &gfs2_system_eaops,
+ &gfs2_security_eaops,
+};
+
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..508b4f7a2449
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __EAOPS_DOT_H__
+#define __EAOPS_DOT_H__
+
+struct gfs2_ea_request;
+struct gfs2_inode;
+
+struct gfs2_eattr_operations { /* per-attribute-type handlers, dispatched through gfs2_ea_ops[] */
+ int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er); /* read the attribute named by @er */
+ int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er); /* create or replace it */
+ int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er); /* delete it */
+ char *eo_name; /* type name without the trailing dot, e.g. "user" */
+};
+
+unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
+
+extern struct gfs2_eattr_operations gfs2_system_eaops;
+
+extern struct gfs2_eattr_operations *gfs2_ea_ops[];
+
+#endif /* __EAOPS_DOT_H__ */
+
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..a65a4ccfd4dd
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1501 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+/**
+ * ea_calc_size - returns the actual number of bytes the request will take up
+ *                (not counting any unstuffed data blocks)
+ * @sdp: the filesystem; sd_jbsize is the largest size that is still stuffed
+ * @er: the request being sized
+ * @size: filled in with the stuffed size if that fits, otherwise with the
+ *        unstuffed size
+ *
+ * Returns: 1 if the EA should be stuffed, 0 if not
+ */
+
+static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
+			unsigned int *size)
+{
+	unsigned int stuffed_size = GFS2_EAREQ_SIZE_STUFFED(er);
+
+	if (stuffed_size <= sdp->sd_jbsize) {
+		*size = stuffed_size;
+		return 1;
+	}
+
+	*size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
+	return 0;
+}
+
+/*
+ * Reject requests that can never be stored: data longer than
+ * GFS2_EA_MAX_DATA_LEN, or an EA record that does not fit in one block.
+ * Returns 0 if the request is acceptable, -ERANGE otherwise.
+ */
+static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
+{
+	unsigned int needed;
+
+	if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
+		return -ERANGE;
+
+	ea_calc_size(sdp, er, &needed);
+
+	/* This can only happen with 512 byte blocks */
+	return (needed > sdp->sd_jbsize) ? -ERANGE : 0;
+}
+
+/*
+ * Per-EA callback used by ea_foreach_i().  @prev is the header visited just
+ * before @ea in the same buffer (NULL for the first one).  Returning 0
+ * continues the walk; any non-zero value stops it and is handed back to the
+ * caller of the walk.
+ */
+typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea,
+ struct gfs2_ea_header *prev, void *private);
+
+/*
+ * Walk every EA header in the EA block @bh, calling @ea_call on each one.
+ * The walk ends at the header flagged as last, or as soon as @ea_call
+ * returns non-zero (that value is returned).  A zero record length, a
+ * record reaching outside the buffer, an invalid type, or a last record
+ * that does not end exactly at the end of the buffer is reported through
+ * gfs2_consist_inode() and fails with -EIO.
+ */
+static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
+ ea_call_t ea_call, void *data)
+{
+ struct gfs2_ea_header *ea, *prev = NULL;
+ int error = 0;
+
+ if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
+ return -EIO;
+
+ for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
+ if (!GFS2_EA_REC_LEN(ea)) /* a zero length would never advance */
+ goto fail;
+ if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
+ bh->b_data + bh->b_size))
+ goto fail;
+ if (!GFS2_EATYPE_VALID(ea->ea_type))
+ goto fail;
+
+ error = ea_call(ip, bh, ea, prev, data);
+ if (error)
+ return error;
+
+ if (GFS2_EA_IS_LAST(ea)) {
+ if ((char *)GFS2_EA2NEXT(ea) !=
+ bh->b_data + bh->b_size)
+ goto fail;
+ break;
+ }
+ }
+
+ return error;
+
+fail:
+ gfs2_consist_inode(ip);
+ return -EIO;
+}
+
+/*
+ * Apply @ea_call to every EA of @ip.  di_eattr names either a single EA
+ * block or, when GFS2_DIF_EA_INDIRECT is set, an indirect block holding up
+ * to sd_inptrs big-endian EA block numbers, ended early by a zero pointer.
+ * Returns the first non-zero result of a block read or of ea_foreach_i(),
+ * otherwise 0.
+ */
+static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
+{
+ struct buffer_head *bh, *eabh;
+ u64 *eablk, *end;
+ int error;
+
+ error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
+ if (error)
+ return error;
+
+ if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
+ error = ea_foreach_i(ip, bh, ea_call, data);
+ goto out;
+ }
+
+ if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
+ error = -EIO;
+ goto out;
+ }
+
+ eablk = (u64 *)(bh->b_data + sizeof(struct gfs2_meta_header)); /* pointers follow the meta header */
+ end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
+
+ for (; eablk < end; eablk++) {
+ u64 bn;
+
+ if (!*eablk)
+ break;
+ bn = be64_to_cpu(*eablk);
+
+ error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+ if (error)
+ break;
+ error = ea_foreach_i(ip, eabh, ea_call, data);
+ brelse(eabh);
+ if (error)
+ break;
+ }
+out:
+ brelse(bh);
+ return error;
+}
+
+struct ea_find { /* private data handed to ea_find_i() */
+ struct gfs2_ea_request *ef_er; /* type and name being looked for */
+ struct gfs2_ea_location *ef_el; /* filled in when a match is found */
+};
+
+/*
+ * ea_foreach() callback for gfs2_ea_find().  On a type and name match it
+ * records @bh, @ea and @prev in the location, takes an extra reference on
+ * @bh and returns 1 to stop the walk; otherwise it returns 0.
+ */
+static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+ void *private)
+{
+ struct ea_find *ef = private;
+ struct gfs2_ea_request *er = ef->ef_er;
+
+ if (ea->ea_type == GFS2_EATYPE_UNUSED)
+ return 0;
+
+ if (ea->ea_type == er->er_type) {
+ if (ea->ea_name_len == er->er_name_len &&
+ !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
+ struct gfs2_ea_location *el = ef->ef_el;
+ get_bh(bh); /* the walker drops its own reference */
+ el->el_bh = bh;
+ el->el_ea = ea;
+ el->el_prev = prev;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Look up the EA described by @er on @ip.  @el is zeroed first, so after a
+ * 0 return a NULL el->el_ea means "not found"; when it is non-NULL the
+ * caller owns a reference on el->el_bh and must brelse() it.  Returns 0 in
+ * both of those cases and -errno if the walk failed.
+ */
+int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+ struct gfs2_ea_location *el)
+{
+ struct ea_find ef;
+ int error;
+
+ ef.ef_er = er;
+ ef.ef_el = el;
+
+ memset(el, 0, sizeof(struct gfs2_ea_location));
+
+ error = ea_foreach(ip, ea_find_i, &ef);
+ if (error > 0) /* ea_find_i() returned 1: found, not an error */
+ return 0;
+
+ return error;
+}
+
+/**
+ * ea_dealloc_unstuffed - free the data blocks of an unstuffed EA
+ * @ip: the inode owning the EA
+ * @bh: the buffer holding @ea
+ * @ea: the EA whose data blocks are freed; stuffed EAs are left alone
+ * @prev: the header preceding @ea in @bh, or NULL
+ * @private: only tested for NULL; non-NULL means keep @ea in place as an
+ *           unused entry instead of merging it into @prev
+ *
+ * Take advantage of the fact that all unstuffed blocks are
+ * allocated from the same RG. But watch, this may not always
+ * be true.
+ *
+ * Returns: errno
+ */
+
+static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea,
+ struct gfs2_ea_header *prev, void *private)
+{
+ int *leave = private;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_holder rg_gh;
+ struct buffer_head *dibh;
+ u64 *dataptrs, bn = 0;
+ u64 bstart = 0;
+ unsigned int blen = 0;
+ unsigned int blks = 0;
+ unsigned int x;
+ int error;
+
+ if (GFS2_EA_IS_STUFFED(ea))
+ return 0;
+
+ dataptrs = GFS2_EA2DATAPTRS(ea);
+ for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { /* count blocks; bn ends up as the last non-zero pointer */
+ if (*dataptrs) {
+ blks++;
+ bn = be64_to_cpu(*dataptrs);
+ }
+ }
+ if (!blks)
+ return 0;
+
+ rgd = gfs2_blk2rgrpd(sdp, bn); /* only this one resource group is locked */
+ if (!rgd) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
+
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
+ if (error)
+ return error;
+
+ error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length + RES_DINODE +
+ RES_EATTR + RES_STATFS + RES_QUOTA, blks);
+ if (error)
+ goto out_gunlock;
+
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+ dataptrs = GFS2_EA2DATAPTRS(ea);
+ for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { /* free the blocks in runs of consecutive numbers */
+ if (!*dataptrs)
+ break;
+ bn = be64_to_cpu(*dataptrs);
+
+ if (bstart + blen == bn)
+ blen++;
+ else {
+ if (bstart)
+ gfs2_free_meta(ip, bstart, blen);
+ bstart = bn;
+ blen = 1;
+ }
+
+ *dataptrs = 0;
+ if (!ip->i_di.di_blocks)
+ gfs2_consist_inode(ip);
+ ip->i_di.di_blocks--;
+ }
+ if (bstart)
+ gfs2_free_meta(ip, bstart, blen); /* flush the final run */
+
+ if (prev && !leave) { /* fold this record's space into the previous one */
+ u32 len;
+
+ len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+ prev->ea_rec_len = cpu_to_be32(len);
+
+ if (GFS2_EA_IS_LAST(ea))
+ prev->ea_flags |= GFS2_EAFLAG_LAST;
+ } else {
+ ea->ea_type = GFS2_EATYPE_UNUSED;
+ ea->ea_num_ptrs = 0;
+ }
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ ip->i_di.di_ctime = get_seconds();
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(sdp);
+
+out_gunlock:
+ gfs2_glock_dq_uninit(&rg_gh);
+ return error;
+}
+
+/*
+ * Free the data blocks of the unstuffed EA @ea, taking the quota and rindex
+ * holds around ea_dealloc_unstuffed().
+ * @leave: non-zero to keep the emptied header in place as an unused entry
+ *         rather than merging it into @prev
+ *
+ * Returns: errno
+ */
+static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+			       struct gfs2_ea_header *ea,
+			       struct gfs2_ea_header *prev, int leave)
+{
+	struct gfs2_alloc *al;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+
+	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out_alloc;
+
+	error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+	if (error)
+		goto out_quota;
+
+	/*
+	 * The callee only tests its private pointer for NULL.  Hand it the
+	 * address of "leave" itself rather than of "error", which is the
+	 * variable this very call assigns.
+	 */
+	error = ea_dealloc_unstuffed(ip, bh, ea, prev, leave ? &leave : NULL);
+
+	gfs2_glock_dq_uninit(&al->al_ri_gh);
+
+out_quota:
+	gfs2_quota_unhold(ip);
+out_alloc:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+struct ea_list { /* private data handed to ea_list_i() */
+ struct gfs2_ea_request *ei_er; /* er_data/er_data_len describe the output buffer */
+ unsigned int ei_size; /* bytes accumulated so far */
+};
+
+/*
+ * ea_foreach() callback for gfs2_ea_list().  Adds the listing size of @ea
+ * to ei_size and, when the request has a buffer, appends the prefixed name
+ * followed by a NUL byte to it.  Returns -ERANGE when the entry would not
+ * fit in the buffer, 0 otherwise.
+ */
+static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+ void *private)
+{
+ struct ea_list *ei = private;
+ struct gfs2_ea_request *er = ei->ei_er;
+ unsigned int ea_size = gfs2_ea_strlen(ea);
+
+ if (ea->ea_type == GFS2_EATYPE_UNUSED)
+ return 0;
+
+ if (er->er_data_len) { /* zero length: only the total size is wanted */
+ char *prefix = NULL;
+ unsigned int l = 0;
+ char c = 0;
+
+ if (ei->ei_size + ea_size > er->er_data_len)
+ return -ERANGE;
+
+ switch (ea->ea_type) {
+ case GFS2_EATYPE_USR:
+ prefix = "user.";
+ l = 5;
+ break;
+ case GFS2_EATYPE_SYS:
+ prefix = "system.";
+ l = 7;
+ break;
+ case GFS2_EATYPE_SECURITY:
+ prefix = "security.";
+ l = 9;
+ break;
+ }
+
+ BUG_ON(l == 0); /* a type with no known prefix reached the listing */
+
+ memcpy(er->er_data + ei->ei_size, prefix, l);
+ memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
+ ea->ea_name_len);
+ memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1); /* terminating NUL in the entry's last byte */
+ }
+
+ ei->ei_size += ea_size;
+
+ return 0;
+}
+
+/**
+ * gfs2_ea_list - list the names of all extended attributes of an inode
+ * @ip: the inode, locked shared for the duration of the call
+ * @er: er_data/er_data_len give the output buffer; if either is unset both
+ *      are cleared and only the total size is computed
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_holder i_gh;
+ int error;
+
+ if (!er->er_data || !er->er_data_len) {
+ er->er_data = NULL;
+ er->er_data_len = 0;
+ }
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (error)
+ return error;
+
+ if (ip->i_di.di_eattr) { /* no EA block: error is still 0, i.e. an empty list */
+ struct ea_list ei = { .ei_er = er, .ei_size = 0 };
+
+ error = ea_foreach(ip, ea_list_i, &ei);
+ if (!error)
+ error = ei.ei_size;
+ }
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return error;
+}
+
+/**
+ * ea_get_unstuffed - actually copies the unstuffed data into the
+ *                    request buffer
+ * @ip: The GFS2 inode
+ * @ea: The extended attribute header structure
+ * @data: The destination buffer; must hold at least GFS2_EA_DATA_LEN(@ea)
+ *        bytes
+ *
+ * All data blocks are submitted for reading first and only then waited on
+ * and copied, one block at a time.
+ *
+ * Returns: errno
+ */
+
+static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+			    char *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head **bh;
+	unsigned int amount = GFS2_EA_DATA_LEN(ea);
+	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+	u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+	unsigned int x;
+	int error = 0;
+
+	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+	if (!bh)
+		return -ENOMEM;
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+				       bh + x);
+		if (error) {
+			/* Release the buffers obtained so far */
+			while (x--)
+				brelse(bh[x]);
+			goto out;
+		}
+		dataptrs++;
+	}
+
+	for (x = 0; x < nptrs; x++) {
+		unsigned int copy;
+
+		error = gfs2_meta_wait(sdp, bh[x]);
+		if (!error && gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED))
+			error = -EIO;
+		if (error) {
+			/* Release this buffer and all not yet consumed */
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			goto out;
+		}
+
+		/*
+		 * Advance by what was really copied, so that on the final
+		 * partial block "amount" stops at zero instead of wrapping
+		 * and "data" never points past the end of the destination.
+		 */
+		copy = (amount < sdp->sd_jbsize) ? amount : sdp->sd_jbsize;
+		memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
+		       copy);
+
+		amount -= copy;
+		data += copy;
+
+		brelse(bh[x]);
+	}
+
+out:
+	kfree(bh);
+	return error;
+}
+
+/*
+ * Copy the value of the EA at @el into @data, either straight out of the
+ * EA block (stuffed) or from its data blocks (unstuffed).  Returns errno.
+ */
+int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+		     char *data)
+{
+	if (!GFS2_EA_IS_STUFFED(el->el_ea))
+		return ea_get_unstuffed(ip, el->el_ea, data);
+
+	memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
+	return 0;
+}
+
+/**
+ * gfs2_ea_get_i - look up an extended attribute and optionally copy its value
+ * @ip: The GFS2 inode
+ * @er: The request structure; when er_data_len is 0 nothing is copied and
+ *      only the value's length is reported
+ *
+ * Fails with -ENODATA if the inode has no EA block or the attribute does
+ * not exist, and with -ERANGE if the value is longer than er_data_len.
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_ea_location el;
+ int error;
+
+ if (!ip->i_di.di_eattr)
+ return -ENODATA;
+
+ error = gfs2_ea_find(ip, er, &el);
+ if (error)
+ return error;
+ if (!el.el_ea)
+ return -ENODATA;
+
+ if (er->er_data_len) {
+ if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
+ error = -ERANGE;
+ else
+ error = gfs2_ea_get_copy(ip, &el, er->er_data);
+ }
+ if (!error)
+ error = GFS2_EA_DATA_LEN(el.el_ea);
+
+ brelse(el.el_bh); /* drop the reference taken by gfs2_ea_find() */
+
+ return error;
+}
+
+/**
+ * gfs2_ea_get - read an extended attribute under a shared inode glock
+ * @ip: The GFS2 inode
+ * @er: The request structure; the name length must be between 1 and
+ *      GFS2_EA_MAX_NAME_LEN (-EINVAL otherwise).  If er_data or
+ *      er_data_len is unset, both are cleared.
+ *
+ * NOTE(review): er_type indexes gfs2_ea_ops[] with no range or NULL check
+ * here; presumably callers reject unknown types first - confirm.
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_holder i_gh;
+ int error;
+
+ if (!er->er_name_len ||
+ er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+ return -EINVAL;
+ if (!er->er_data || !er->er_data_len) {
+ er->er_data = NULL;
+ er->er_data_len = 0;
+ }
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (error)
+ return error;
+
+ error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return error;
+}
+
+/**
+ * ea_alloc_blk - allocates a new block for extended attributes.
+ * @ip: A pointer to the inode that's getting extended attributes
+ * @bhp: Pointer to pointer to a struct buffer_head
+ *
+ * The new block is added to the current transaction, stamped as an EA
+ * block and initialised with a single unused record, flagged as last,
+ * whose record length is sd_jbsize.  di_blocks is incremented.  The
+ * buffer is returned through @bhp.
+ *
+ * Returns: errno (as written, always 0)
+ */
+
+static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_ea_header *ea;
+ u64 block;
+
+ block = gfs2_alloc_meta(ip);
+
+ *bhp = gfs2_meta_new(ip->i_gl, block);
+ gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
+ gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
+ gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
+
+ ea = GFS2_EA_BH2FIRST(*bhp);
+ ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
+ ea->ea_type = GFS2_EATYPE_UNUSED;
+ ea->ea_flags = GFS2_EAFLAG_LAST;
+ ea->ea_num_ptrs = 0;
+
+ ip->i_di.di_blocks++;
+
+ return 0;
+}
+
+/**
+ * ea_write - writes the request info to an ea, creating new blocks if
+ *            necessary
+ * @ip: inode that is being modified
+ * @ea: the location of the new ea in a block
+ * @er: the write request
+ *
+ * If the stuffed size of the request fits in sd_jbsize the data is stored
+ * right after the name; otherwise one data block per sd_jbsize bytes is
+ * allocated, filled (the last one zero-padded) and its block number is
+ * recorded in the ea's pointer array.
+ *
+ * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
+ *
+ * returns : errno (as written, always 0)
+ */
+
+static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+ struct gfs2_ea_request *er)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+ ea->ea_data_len = cpu_to_be32(er->er_data_len);
+ ea->ea_name_len = er->er_name_len;
+ ea->ea_type = er->er_type;
+ ea->__pad = 0;
+
+ memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
+
+ if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
+ ea->ea_num_ptrs = 0;
+ memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
+ } else {
+ u64 *dataptr = GFS2_EA2DATAPTRS(ea);
+ const char *data = er->er_data;
+ unsigned int data_len = er->er_data_len;
+ unsigned int copy;
+ unsigned int x;
+
+ ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
+ for (x = 0; x < ea->ea_num_ptrs; x++) {
+ struct buffer_head *bh;
+ u64 block;
+ int mh_size = sizeof(struct gfs2_meta_header);
+
+ block = gfs2_alloc_meta(ip);
+
+ bh = gfs2_meta_new(ip->i_gl, block);
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
+
+ ip->i_di.di_blocks++;
+
+ copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
+ data_len;
+ memcpy(bh->b_data + mh_size, data, copy);
+ if (copy < sdp->sd_jbsize) /* zero the unused tail of the last block */
+ memset(bh->b_data + mh_size + copy, 0,
+ sdp->sd_jbsize - copy);
+
+ *dataptr++ = cpu_to_be64(bh->b_blocknr);
+ data += copy;
+ data_len -= copy;
+
+ brelse(bh);
+ }
+
+ gfs2_assert_withdraw(sdp, !data_len); /* every byte must have been placed */
+ }
+
+ return 0;
+}
+
+/*
+ * Worker invoked by ea_alloc_skeleton() once its transaction has begun.
+ * A non-zero return skips the dinode update and becomes the return value
+ * of ea_alloc_skeleton().
+ */
+typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
+ struct gfs2_ea_request *er, void *private);
+
+/*
+ * Common frame for EA changes that allocate blocks: lock and check quota,
+ * reserve @blks blocks, begin a transaction, run @skeleton_call and, if it
+ * succeeds, write the dinode back with a new ctime (and with er_mode when
+ * GFS2_ERF_MODE is set).  Everything is released in reverse order on the
+ * way out.  Returns errno.
+ */
+static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+ unsigned int blks,
+ ea_skeleton_call_t skeleton_call, void *private)
+{
+ struct gfs2_alloc *al;
+ struct buffer_head *dibh;
+ int error;
+
+ al = gfs2_alloc_get(ip);
+
+ error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out;
+
+ error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+ if (error)
+ goto out_gunlock_q;
+
+ al->al_requested = blks;
+
+ error = gfs2_inplace_reserve(ip);
+ if (error)
+ goto out_gunlock_q;
+
+ error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
+ blks + al->al_rgd->rd_ri.ri_length +
+ RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+ if (error)
+ goto out_ipres;
+
+ error = skeleton_call(ip, er, private);
+ if (error)
+ goto out_end_trans;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ if (er->er_flags & GFS2_ERF_MODE) {
+ gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+ (ip->i_di.di_mode & S_IFMT) ==
+ (er->er_mode & S_IFMT)); /* the file type bits must not change */
+ ip->i_di.di_mode = er->er_mode;
+ }
+ ip->i_di.di_ctime = get_seconds();
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+out_end_trans:
+ gfs2_trans_end(GFS2_SB(&ip->i_inode));
+out_ipres:
+ gfs2_inplace_release(ip);
+out_gunlock_q:
+ gfs2_quota_unlock(ip);
+out:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
+/*
+ * ea_alloc_skeleton() worker for ea_init(): allocate the inode's first EA
+ * block, record it in di_eattr and write the request into its first record.
+ * @private is unused.
+ */
+static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+ void *private)
+{
+ struct buffer_head *bh;
+ int error;
+
+ error = ea_alloc_blk(ip, &bh);
+ if (error)
+ return error;
+
+ ip->i_di.di_eattr = bh->b_blocknr;
+ error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
+
+ brelse(bh);
+
+ return error;
+}
+
+/**
+ * ea_init - initializes a new eattr block
+ * @ip: the inode, which has no EA block yet
+ * @er: the request to store as the first EA
+ *
+ * Reserves one block for the EA block itself plus, when the request is too
+ * large to be stuffed, one block per sd_jbsize bytes of data.
+ *
+ * Returns: errno
+ */
+
+static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int data_blks = 0;
+
+	if (GFS2_EAREQ_SIZE_STUFFED(er) > sdp->sd_jbsize)
+		data_blks = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
+
+	return ea_alloc_skeleton(ip, er, 1 + data_blks, ea_init_i, NULL);
+}
+
+/*
+ * Split the slack at the end of @ea off into a new record.  @ea is shrunk
+ * to GFS2_EA_SIZE(ea), the new header starts right behind it and takes the
+ * remaining record length, and the "last" flag, if set, moves from @ea to
+ * the new header.  Only ea_rec_len and ea_flags of the new header are
+ * written here.  Returns the new header.
+ */
+static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
+{
+ u32 ea_size = GFS2_EA_SIZE(ea);
+ struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
+ ea_size);
+ u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size; /* read before ea_rec_len is overwritten below */
+ int last = ea->ea_flags & GFS2_EAFLAG_LAST;
+
+ ea->ea_rec_len = cpu_to_be32(ea_size);
+ ea->ea_flags ^= last; /* clears the flag only if it was set */
+
+ new->ea_rec_len = cpu_to_be32(new_size);
+ new->ea_flags = last;
+
+ return new;
+}
+
+/*
+ * Drop the old copy of an EA at @el after its replacement has been written.
+ * Without a previous header, or when the old EA is unstuffed, it is only
+ * marked unused; otherwise its space is merged into the previous record.
+ */
+static void ea_set_remove_stuffed(struct gfs2_inode *ip,
+ struct gfs2_ea_location *el)
+{
+ struct gfs2_ea_header *ea = el->el_ea;
+ struct gfs2_ea_header *prev = el->el_prev;
+ u32 len;
+
+ gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+
+ if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
+ ea->ea_type = GFS2_EATYPE_UNUSED;
+ return;
+ } else if (GFS2_EA2NEXT(prev) != ea) { /* presumably prev was split when the new EA was written - TODO confirm */
+ prev = GFS2_EA2NEXT(prev);
+ gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
+ }
+
+ len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+ prev->ea_rec_len = cpu_to_be32(len);
+
+ if (GFS2_EA_IS_LAST(ea))
+ prev->ea_flags |= GFS2_EAFLAG_LAST;
+}
+
+struct ea_set { /* state shared between ea_set_i(), ea_set_simple() and its helpers */
+ int ea_split; /* 1: carve the new EA out of a used record's slack; 0: reuse an unused record */
+
+ struct gfs2_ea_request *es_er; /* the request being stored */
+ struct gfs2_ea_location *es_el; /* old copy to remove afterwards, or NULL */
+
+ struct buffer_head *es_bh; /* buffer of the chosen record (allocating path only) */
+ struct gfs2_ea_header *es_ea; /* the chosen record (allocating path only) */
+};
+
+/*
+ * Store a stuffed request in @ea (or in the slack split off from it) inside
+ * its own transaction, remove the old copy if there is one, and write the
+ * dinode back with a new ctime and, when GFS2_ERF_MODE is set, the new mode.
+ * Returns errno.
+ */
+static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea, struct ea_set *es)
+{
+ struct gfs2_ea_request *er = es->es_er;
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
+ if (error)
+ return error;
+
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+ if (es->ea_split)
+ ea = ea_split_ea(ea);
+
+ ea_write(ip, ea, er); /* result not checked; ea_write() as written always returns 0 */
+
+ if (es->es_el)
+ ea_set_remove_stuffed(ip, es->es_el);
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out;
+
+ if (er->er_flags & GFS2_ERF_MODE) {
+ gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+ (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
+ ip->i_di.di_mode = er->er_mode;
+ }
+ ip->i_di.di_ctime = get_seconds();
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+out:
+ gfs2_trans_end(GFS2_SB(&ip->i_inode));
+ return error;
+}
+
+/*
+ * ea_alloc_skeleton() worker for the unstuffed case of ea_set_simple():
+ * write the request into the record chosen in @private (a struct ea_set),
+ * splitting it first if required, then remove the old copy if any.
+ */
+static int ea_set_simple_alloc(struct gfs2_inode *ip,
+ struct gfs2_ea_request *er, void *private)
+{
+ struct ea_set *es = private;
+ struct gfs2_ea_header *ea = es->es_ea;
+ int error;
+
+ gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
+
+ if (es->ea_split)
+ ea = ea_split_ea(ea);
+
+ error = ea_write(ip, ea, er);
+ if (error)
+ return error;
+
+ if (es->es_el)
+ ea_set_remove_stuffed(ip, es->es_el);
+
+ return 0;
+}
+
+/*
+ * ea_foreach() callback for ea_set_i(): try to store the request in @ea.
+ * An unused record is reused if it is long enough; a used record is split
+ * if the slack behind its contents is long enough.  Returns 0 to keep
+ * looking, 1 once the request has been stored, -errno on failure.
+ */
+static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+ void *private)
+{
+ struct ea_set *es = private;
+ unsigned int size;
+ int stuffed;
+ int error;
+
+ stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
+
+ if (ea->ea_type == GFS2_EATYPE_UNUSED) {
+ if (GFS2_EA_REC_LEN(ea) < size)
+ return 0;
+ if (!GFS2_EA_IS_STUFFED(ea)) { /* unused record still owns data blocks: free them, keep the record */
+ error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
+ if (error)
+ return error;
+ }
+ es->ea_split = 0;
+ } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
+ es->ea_split = 1;
+ else
+ return 0;
+
+ if (stuffed) {
+ error = ea_set_simple_noalloc(ip, bh, ea, es);
+ if (error)
+ return error;
+ } else {
+ unsigned int blks;
+
+ es->es_bh = bh;
+ es->es_ea = ea;
+ blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
+ GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+ error = ea_alloc_skeleton(ip, es->es_er, blks,
+ ea_set_simple_alloc, es);
+ if (error)
+ return error;
+ }
+
+ return 1;
+}
+
+/*
+ * ea_alloc_skeleton() worker for ea_set_i(): store the request in a newly
+ * allocated EA block and record that block in the inode's indirect block.
+ * If the inode has no indirect block yet, one is created with the existing
+ * EA block as its first entry.  @private, if non-NULL, is the
+ * gfs2_ea_location of an old copy to remove.  Fails with -ENOSPC when the
+ * indirect block has no free pointer left.
+ */
+static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+ void *private)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *indbh, *newbh;
+ u64 *eablk;
+ int error;
+ int mh_size = sizeof(struct gfs2_meta_header);
+
+ if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+ u64 *end;
+
+ error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
+ &indbh);
+ if (error)
+ return error;
+
+ if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+ error = -EIO;
+ goto out;
+ }
+
+ eablk = (u64 *)(indbh->b_data + mh_size);
+ end = eablk + sdp->sd_inptrs;
+
+ for (; eablk < end; eablk++) /* find the first free pointer slot */
+ if (!*eablk)
+ break;
+
+ if (eablk == end) {
+ error = -ENOSPC;
+ goto out;
+ }
+
+ gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+ } else {
+ u64 blk;
+
+ blk = gfs2_alloc_meta(ip);
+
+ indbh = gfs2_meta_new(ip->i_gl, blk);
+ gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+ gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+ gfs2_buffer_clear_tail(indbh, mh_size);
+
+ eablk = (u64 *)(indbh->b_data + mh_size);
+ *eablk = cpu_to_be64(ip->i_di.di_eattr); /* the old direct EA block becomes entry 0 */
+ ip->i_di.di_eattr = blk;
+ ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
+ ip->i_di.di_blocks++;
+
+ eablk++; /* the new EA block goes into entry 1 */
+ }
+
+ error = ea_alloc_blk(ip, &newbh);
+ if (error)
+ goto out;
+
+ *eablk = cpu_to_be64((u64)newbh->b_blocknr);
+ error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
+ brelse(newbh);
+ if (error)
+ goto out;
+
+ if (private)
+ ea_set_remove_stuffed(ip, private);
+
+out:
+ brelse(indbh);
+ return error;
+}
+
+/*
+ * Store the request on an inode that already has EA storage.  First try to
+ * fit it into an existing EA block via ea_set_simple(); if no record has
+ * room, fall back to ea_set_block(), which adds a new EA block.  @el is the
+ * old copy to remove, or NULL.  Returns errno.
+ */
+static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+ struct gfs2_ea_location *el)
+{
+ struct ea_set es;
+ unsigned int blks = 2;
+ int error;
+
+ memset(&es, 0, sizeof(struct ea_set));
+ es.es_er = er;
+ es.es_el = el;
+
+ error = ea_foreach(ip, ea_set_simple, &es);
+ if (error > 0) /* ea_set_simple() returned 1: stored */
+ return 0;
+ if (error)
+ return error;
+
+ if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
+ blks++; /* an indirect block has to be allocated as well */
+ if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+ blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+ return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
+}
+
+/*
+ * Free the data blocks of the old, unstuffed copy at @el after its
+ * replacement has been stored.  If el_prev no longer directly precedes
+ * el_ea, it is advanced by one record first.
+ */
+static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
+ struct gfs2_ea_location *el)
+{
+ if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) { /* presumably el_prev was split by the write - TODO confirm */
+ el->el_prev = GFS2_EA2NEXT(el->el_prev);
+ gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+ GFS2_EA2NEXT(el->el_prev) == el->el_ea);
+ }
+
+ return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
+}
+
+/*
+ * Create or replace the EA described by @er.  With XATTR_REPLACE a missing
+ * attribute is -ENODATA; with XATTR_CREATE an existing one is -EEXIST.
+ * Replacing an existing attribute on an inode with GFS2_DIF_APPENDONLY set
+ * is -EPERM.  Returns errno.
+ */
+int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_ea_location el;
+ int error;
+
+ if (!ip->i_di.di_eattr) { /* no EA storage at all yet */
+ if (er->er_flags & XATTR_REPLACE)
+ return -ENODATA;
+ return ea_init(ip, er);
+ }
+
+ error = gfs2_ea_find(ip, er, &el);
+ if (error)
+ return error;
+
+ if (el.el_ea) {
+ if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
+ brelse(el.el_bh);
+ return -EPERM;
+ }
+
+ error = -EEXIST;
+ if (!(er->er_flags & XATTR_CREATE)) {
+ int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); /* sampled before the old copy is modified */
+ error = ea_set_i(ip, er, &el);
+ if (!error && unstuffed)
+ ea_set_remove_unstuffed(ip, &el); /* NOTE(review): result ignored - the new value is already stored */
+ }
+
+ brelse(el.el_bh);
+ } else {
+ error = -ENODATA;
+ if (!(er->er_flags & XATTR_REPLACE))
+ error = ea_set_i(ip, er, NULL);
+ }
+
+ return error;
+}
+
+/*
+ * Set an extended attribute under an exclusive inode glock.  The name
+ * length must be between 1 and GFS2_EA_MAX_NAME_LEN (-EINVAL), the request
+ * must pass ea_check_size() (-ERANGE), and the inode must not be immutable
+ * (-EPERM).
+ * NOTE(review): er_type indexes gfs2_ea_ops[] with no range or NULL check
+ * here; presumably callers reject unknown types first - confirm.
+ */
+int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_holder i_gh;
+ int error;
+
+ if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+ return -EINVAL;
+ if (!er->er_data || !er->er_data_len) {
+ er->er_data = NULL;
+ er->er_data_len = 0;
+ }
+ error = ea_check_size(GFS2_SB(&ip->i_inode), er);
+ if (error)
+ return error;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ if (error)
+ return error;
+
+ if (IS_IMMUTABLE(&ip->i_inode))
+ error = -EPERM;
+ else
+ error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return error;
+}
+
+/*
+ * Remove the stuffed EA at @el in its own transaction: merge its space
+ * into the previous record when there is one, otherwise mark it unused,
+ * then write the dinode back with a new ctime.  Returns errno.
+ */
+static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
+{
+ struct gfs2_ea_header *ea = el->el_ea;
+ struct gfs2_ea_header *prev = el->el_prev;
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+ if (error)
+ return error;
+
+ gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+
+ if (prev) {
+ u32 len;
+
+ len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+ prev->ea_rec_len = cpu_to_be32(len);
+
+ if (GFS2_EA_IS_LAST(ea))
+ prev->ea_flags |= GFS2_EAFLAG_LAST;
+ } else
+ ea->ea_type = GFS2_EATYPE_UNUSED;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ ip->i_di.di_ctime = get_seconds();
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+ return error;
+}
+
+/**
+ * gfs2_ea_remove_i - look up an extended attribute and remove it
+ * @ip: the inode
+ * @er: request naming the attribute
+ *
+ * Returns: errno; -ENODATA when di_eattr is zero or the attribute is
+ *          not found
+ */
+
+int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct gfs2_ea_location loc;
+	int ret;
+
+	if (ip->i_di.di_eattr == 0)
+		return -ENODATA;
+
+	ret = gfs2_ea_find(ip, er, &loc);
+	if (ret)
+		return ret;
+	if (loc.el_ea == NULL)
+		return -ENODATA;
+
+	if (!GFS2_EA_IS_STUFFED(loc.el_ea))
+		ret = ea_remove_unstuffed(ip, loc.el_bh, loc.el_ea,
+					  loc.el_prev, 0);
+	else
+		ret = ea_remove_stuffed(ip, &loc);
+
+	/* The lookup returned a held buffer; release it on every path. */
+	brelse(loc.el_bh);
+	return ret;
+}
+
+/**
+ * gfs2_ea_remove - removes an extended attribute
+ * @ip: pointer to the inode of the target file
+ * @er: request information
+ *
+ * The type-specific eo_remove operation runs with the inode glock held in
+ * the exclusive state.
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_holder i_gh;
+ int error;
+
+ if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+ return -EINVAL;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ if (error)
+ return error;
+
+ /* Removal is refused on both immutable and append-only inodes. */
+ if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
+ error = -EPERM;
+ else
+ error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return error;
+}
+
+/**
+ * ea_acl_chmod_unstuffed - overwrite the data blocks of an attribute
+ * @ip: the inode
+ * @ea: header of the attribute; its pointer array names the data blocks
+ * @data: replacement data; GFS2_EA_DATA_LEN(ea) bytes are consumed
+ *
+ * On success the transaction started here is left open: the "out" path
+ * does not end it, and the caller gfs2_ea_acl_chmod() calls
+ * gfs2_trans_end() itself. On a failure after the transaction has been
+ * started, it is ended here ("fail" path).
+ *
+ * Returns: errno
+ */
+
+static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
+ struct gfs2_ea_header *ea, char *data)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head **bh;
+ unsigned int amount = GFS2_EA_DATA_LEN(ea);
+ unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+ u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+ unsigned int x;
+ int error;
+
+ bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+ if (!bh)
+ return -ENOMEM;
+
+ error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
+ if (error)
+ goto out;
+
+ /* Get a buffer for every data block (flags 0 here; the buffers are
+    waited on individually in the next loop). */
+ for (x = 0; x < nptrs; x++) {
+ error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+ bh + x);
+ if (error) {
+ /* Release the buffers obtained so far. */
+ while (x--)
+ brelse(bh[x]);
+ goto fail;
+ }
+ dataptrs++;
+ }
+
+ for (x = 0; x < nptrs; x++) {
+ error = gfs2_meta_wait(sdp, bh[x]);
+ if (error) {
+ /* Buffers before x were already released at the end
+    of their own iteration. */
+ for (; x < nptrs; x++)
+ brelse(bh[x]);
+ goto fail;
+ }
+ if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+ for (; x < nptrs; x++)
+ brelse(bh[x]);
+ error = -EIO;
+ goto fail;
+ }
+
+ gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
+
+ /* Copy min(amount, sd_jbsize) bytes just past the meta header. */
+ memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
+ (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+
+ /* May wrap on the final iteration; amount is not read again
+    after the loop ends. */
+ amount -= sdp->sd_jbsize;
+ data += sdp->sd_jbsize;
+
+ brelse(bh[x]);
+ }
+
+out:
+ kfree(bh);
+ return error;
+
+fail:
+ gfs2_trans_end(sdp);
+ kfree(bh);
+ return error;
+}
+
+/**
+ * gfs2_ea_acl_chmod - overwrite an attribute's data and apply an iattr
+ * @ip: the inode
+ * @el: location of the attribute to overwrite
+ * @attr: attribute changes handed to inode_setattr()
+ * @data: new attribute data; GFS2_EA_DATA_LEN(el->el_ea) bytes are written
+ *
+ * The data overwrite and the dinode update share one transaction, which
+ * is ended here.
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+ struct iattr *attr, char *data)
+{
+ struct buffer_head *dibh;
+ int error;
+
+ if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+ error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+ if (error)
+ return error;
+
+ /* Overwrite in place; the stored data length is not changed. */
+ gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+ memcpy(GFS2_EA2DATA(el->el_ea), data,
+ GFS2_EA_DATA_LEN(el->el_ea));
+ } else
+ /* Starts the transaction itself; it is still open on success
+    and already ended on failure. */
+ error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
+
+ if (error)
+ return error;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ error = inode_setattr(&ip->i_inode, attr);
+ gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+ gfs2_inode_attr_out(ip);
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+ return error;
+}
+
+/**
+ * ea_dealloc_indirect - free the blocks listed in the indirect block
+ * @ip: the inode
+ *
+ * Reads the block at di_eattr and walks its pointer array twice: first to
+ * count the blocks and build the resource group list, then, inside a
+ * transaction, to free the blocks and zero the pointers. Finally clears
+ * GFS2_DIF_EA_INDIRECT and writes the dinode out.
+ *
+ * Returns: errno
+ */
+
+static int ea_dealloc_indirect(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_rgrp_list rlist;
+ struct buffer_head *indbh, *dibh;
+ u64 *eablk, *end;
+ unsigned int rg_blocks = 0;
+ u64 bstart = 0;
+ unsigned int blen = 0;
+ unsigned int blks = 0;
+ unsigned int x;
+ int error;
+
+ memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+ error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh);
+ if (error)
+ return error;
+
+ if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+ error = -EIO;
+ goto out;
+ }
+
+ eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+ end = eablk + sdp->sd_inptrs;
+
+ /* Pass 1: count blocks and add the start of each contiguous run
+    to the resource group list. */
+ for (; eablk < end; eablk++) {
+ u64 bn;
+
+ /* A zero pointer ends the list. */
+ if (!*eablk)
+ break;
+ bn = be64_to_cpu(*eablk);
+
+ /* Extend the current run, or close it and start a new one. */
+ if (bstart + blen == bn)
+ blen++;
+ else {
+ if (bstart)
+ gfs2_rlist_add(sdp, &rlist, bstart);
+ bstart = bn;
+ blen = 1;
+ }
+ blks++;
+ }
+ if (bstart)
+ gfs2_rlist_add(sdp, &rlist, bstart);
+ else
+ /* No pointers found: nothing to free; error is 0 here. */
+ goto out;
+
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+ /* Sum ri_length over the listed resource groups; the total is part
+    of the transaction reservation below. */
+ for (x = 0; x < rlist.rl_rgrps; x++) {
+ struct gfs2_rgrpd *rgd;
+ rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+ rg_blocks += rgd->rd_ri.ri_length;
+ }
+
+ error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+ if (error)
+ goto out_rlist_free;
+
+ error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
+ RES_STATFS + RES_QUOTA, blks);
+ if (error)
+ goto out_gunlock;
+
+ gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+
+ eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+ bstart = 0;
+ blen = 0;
+
+ /* Pass 2: free each contiguous run and zero the pointers. */
+ for (; eablk < end; eablk++) {
+ u64 bn;
+
+ if (!*eablk)
+ break;
+ bn = be64_to_cpu(*eablk);
+
+ if (bstart + blen == bn)
+ blen++;
+ else {
+ if (bstart)
+ gfs2_free_meta(ip, bstart, blen);
+ bstart = bn;
+ blen = 1;
+ }
+
+ *eablk = 0;
+ /* di_blocks is about to be decremented; a zero count here
+    is reported through gfs2_consist_inode(). */
+ if (!ip->i_di.di_blocks)
+ gfs2_consist_inode(ip);
+ ip->i_di.di_blocks--;
+ }
+ if (bstart)
+ gfs2_free_meta(ip, bstart, blen);
+
+ ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(sdp);
+
+out_gunlock:
+ gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist_free:
+ gfs2_rlist_free(&rlist);
+out:
+ brelse(indbh);
+ return error;
+}
+
+/**
+ * ea_dealloc_block - free the block referenced by di_eattr
+ * @ip: the inode
+ *
+ * Takes the glock of the resource group containing the block in the
+ * exclusive state, frees the block inside a transaction, clears di_eattr
+ * and decrements di_blocks.
+ *
+ * Returns: errno
+ */
+
+static int ea_dealloc_block(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_rgrpd *rgd;
+ struct buffer_head *dibh;
+ int error;
+
+ /* No resource group maps this block number: treat as corruption. */
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
+ if (!rgd) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
+
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+ &al->al_rgd_gh);
+ if (error)
+ return error;
+
+ error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
+ RES_QUOTA, 1);
+ if (error)
+ goto out_gunlock;
+
+ gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
+
+ ip->i_di.di_eattr = 0;
+ /* di_blocks is about to be decremented; a zero count here is
+    reported through gfs2_consist_inode(). */
+ if (!ip->i_di.di_blocks)
+ gfs2_consist_inode(ip);
+ ip->i_di.di_blocks--;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(sdp);
+
+out_gunlock:
+ gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ return error;
+}
+
+/**
+ * gfs2_ea_dealloc - deallocate the extended attribute fork
+ * @ip: the inode
+ *
+ * Holds the quota and rindex for the duration, then frees in three steps:
+ * per-attribute work via ea_foreach(), the indirect block's targets (only
+ * when GFS2_DIF_EA_INDIRECT is set), and finally the block at di_eattr.
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip)
+{
+ struct gfs2_alloc *al;
+ int error;
+
+ al = gfs2_alloc_get(ip);
+
+ error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out_alloc;
+
+ error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+ if (error)
+ goto out_quota;
+
+ /* ea_dealloc_unstuffed is applied to every attribute. */
+ error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
+ if (error)
+ goto out_rindex;
+
+ if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+ error = ea_dealloc_indirect(ip);
+ if (error)
+ goto out_rindex;
+ }
+
+ error = ea_dealloc_block(ip);
+
+ /* Unwind in the reverse order of acquisition. */
+out_rindex:
+ gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_quota:
+ gfs2_quota_unhold(ip);
+out_alloc:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..ffa65947d686
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __EATTR_DOT_H__
+#define __EATTR_DOT_H__
+
+struct gfs2_inode;
+struct iattr;
+
+/* Record length and data length of an attribute header, converted from
+   big-endian to CPU byte order. */
+#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
+#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
+
+/* Bytes used by an existing attribute: header + name + either the data
+   itself (stuffed) or one u64 pointer per data block, rounded up to a
+   multiple of 8. */
+#define GFS2_EA_SIZE(ea) \
+ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
+ ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
+ (sizeof(u64) * (ea)->ea_num_ptrs)), 8)
+
+/* An attribute is "stuffed" when it has no data-block pointers. */
+#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
+/* Non-zero when the header carries GFS2_EAFLAG_LAST. */
+#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
+
+/* Bytes a request would occupy if stored stuffed (data in the record). */
+#define GFS2_EAREQ_SIZE_STUFFED(er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
+
+/* Bytes a request would occupy if stored unstuffed: one u64 pointer per
+   sd_jbsize-sized piece of the data. */
+#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
+ sizeof(u64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
+
+/* The name starts right after the header; stuffed data follows the name. */
+#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
+#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
+
+/* The pointer array starts after the name padded to a multiple of 8. */
+#define GFS2_EA2DATAPTRS(ea) \
+((u64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
+
+/* Header located GFS2_EA_REC_LEN(ea) bytes after this one. */
+#define GFS2_EA2NEXT(ea) \
+((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
+
+/* First header in a buffer: immediately after the meta header. */
+#define GFS2_EA_BH2FIRST(bh) \
+((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
+
+/* NOTE(review): presumably a bit for gfs2_ea_request.er_flags that marks
+   er_mode as valid; no user is visible here, confirm against callers. */
+#define GFS2_ERF_MODE 0x80000000
+
+/* Describes one extended attribute operation. */
+struct gfs2_ea_request {
+ const char *er_name;
+ /* Value buffer; gfs2_ea_set() normalizes an empty value to NULL/0. */
+ char *er_data;
+ /* Must be in 1..GFS2_EA_MAX_NAME_LEN for set and remove. */
+ unsigned int er_name_len;
+ unsigned int er_data_len;
+ unsigned int er_type; /* GFS2_EATYPE_... */
+ /* Tested against XATTR_CREATE and XATTR_REPLACE by the set path. */
+ int er_flags;
+ /* NOTE(review): presumably paired with GFS2_ERF_MODE; confirm. */
+ mode_t er_mode;
+};
+
+/* Result of gfs2_ea_find(): where an attribute lives. Callers release
+   el_bh with brelse() once el_ea is non-NULL and they are done with it. */
+struct gfs2_ea_location {
+ struct buffer_head *el_bh;
+ /* The attribute's header; NULL when the attribute was not found. */
+ struct gfs2_ea_header *el_ea;
+ /* Header preceding el_ea, or NULL; used to merge records on removal. */
+ struct gfs2_ea_header *el_prev;
+};
+
+int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+
+int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip);
+
+/* Exported to acl.c */
+
+int gfs2_ea_find(struct gfs2_inode *ip,
+ struct gfs2_ea_request *er,
+ struct gfs2_ea_location *el);
+int gfs2_ea_get_copy(struct gfs2_inode *ip,
+ struct gfs2_ea_location *el,
+ char *data);
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+ struct iattr *attr, char *data);
+
+/*
+ * Length of an attribute's full name as a string: a per-type prefix
+ * length, the stored name length and one extra byte. Types other than
+ * USR, SYS and SECURITY yield 0.
+ */
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
+	unsigned int prefix_len;
+
+	switch (ea->ea_type) {
+	case GFS2_EATYPE_USR:
+		prefix_len = 5;
+		break;
+	case GFS2_EATYPE_SYS:
+		prefix_len = 7;
+		break;
+	case GFS2_EATYPE_SECURITY:
+		prefix_len = 9;
+		break;
+	default:
+		return 0;
+	}
+
+	return prefix_len + ea->ea_name_len + 1;
+}
+
+#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..3bb11c0f8b56
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __GFS2_DOT_H__
+#define __GFS2_DOT_H__
+
+/* Named 0/1 values for boolean-style arguments.
+   NOTE(review): presumably used for "create", "wait" and "force"
+   parameters elsewhere in GFS2; no user is visible in this header. */
+enum {
+ NO_CREATE = 0,
+ CREATE = 1,
+};
+
+enum {
+ NO_WAIT = 0,
+ WAIT = 1,
+};
+
+enum {
+ NO_FORCE = 0,
+ FORCE = 1,
+};
+
+#define GFS2_FAST_NAME_SIZE 8
+
+#endif /* __GFS2_DOT_H__ */
+
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..78fe0fae23ff
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2231 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/delay.h>
+#include <linux/sort.h>
+#include <linux/jhash.h>
+#include <linux/kallsyms.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/list.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lm.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "super.h"
+#include "util.h"
+
+/* A holder bundled with a work item. rq_greedy() recovers the enclosing
+   structure from the embedded holder with container_of() and frees it. */
+struct greedy {
+ struct gfs2_holder gr_gh;
+ struct work_struct gr_work;
+};
+
+/* One chain of the glock hash table; glocks are linked through gl_list. */
+struct gfs2_gl_hash_bucket {
+ struct hlist_head hb_list;
+};
+
+typedef void (*glock_examiner) (struct gfs2_glock * gl);
+
+static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
+static int dump_glock(struct gfs2_glock *gl);
+static int dump_inode(struct gfs2_inode *ip);
+
+#define GFS2_GL_HASH_SHIFT 15
+#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
+#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
+
+static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
+
+/*
+ * Despite what you might think, the numbers below are not arbitrary :-)
+ * They are taken from the ipv4 routing hash code, which is well tested
+ * and thus should be nearly optimal. Later on we might tweak the numbers
+ * but for now this should be fine.
+ *
+ * The reason for putting the locks in a separate array from the list heads
+ * is that we can have fewer locks than list heads and save memory. We use
+ * the same hash function for both, but with a different hash mask.
+ */
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
+ defined(CONFIG_PROVE_LOCKING)
+
+#ifdef CONFIG_LOCKDEP
+# define GL_HASH_LOCK_SZ 256
+#else
+# if NR_CPUS >= 32
+# define GL_HASH_LOCK_SZ 4096
+# elif NR_CPUS >= 16
+# define GL_HASH_LOCK_SZ 2048
+# elif NR_CPUS >= 8
+# define GL_HASH_LOCK_SZ 1024
+# elif NR_CPUS >= 4
+# define GL_HASH_LOCK_SZ 512
+# else
+# define GL_HASH_LOCK_SZ 256
+# endif
+#endif
+
+/* We never want more locks than chains */
+#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
+# undef GL_HASH_LOCK_SZ
+# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
+#endif
+
+static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
+
+/* Pick the bucket lock for a hash value by masking it down to the size
+   of the lock array. */
+static inline rwlock_t *gl_lock_addr(unsigned int x)
+{
+	const unsigned int slot = x & (GL_HASH_LOCK_SZ - 1);
+
+	return gl_hash_locks + slot;
+}
+#else /* not SMP, so no spinlocks required */
+/*
+ * Variant for builds without the lock array: every hash value maps to
+ * NULL. The parameter is given an explicit type, matching the other
+ * variant of this function, instead of the old-style untyped
+ * (implicit int) declaration.
+ */
+static inline rwlock_t *gl_lock_addr(unsigned int x)
+{
+	return NULL;
+}
+#endif
+
+/**
+ * relaxed_state_ok - can a request be satisfied by the current lock state?
+ * @actual: the state the lock is currently in
+ * @requested: the state the caller asked for
+ * @flags: the caller's modifier flags
+ *
+ * An exact match always succeeds. With GL_EXACT nothing else does.
+ * Otherwise an exclusive lock satisfies a shared request, and with
+ * LM_FLAG_ANY any state other than unlocked is acceptable.
+ *
+ * Returns: 1 if the locks are compatible, 0 otherwise
+ */
+
+static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
+				   int flags)
+{
+	if (actual == requested)
+		return 1;
+	if (flags & GL_EXACT)
+		return 0;
+
+	return (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED) ||
+	       (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY));
+}
+
+/**
+ * gl_hash() - Turn a lock name into a hash bucket number
+ * @sdp: The superblock the glock belongs to (its pointer value is hashed)
+ * @name: The lock name (number and type are hashed)
+ *
+ * Returns: The number of the corresponding hash bucket
+ */
+
+static unsigned int gl_hash(const struct gfs2_sbd *sdp,
+			    const struct lm_lockname *name)
+{
+	unsigned int bucket = jhash(&name->ln_number, sizeof(u64), 0);
+
+	bucket = jhash(&name->ln_type, sizeof(unsigned int), bucket);
+	bucket = jhash(&sdp, sizeof(struct gfs2_sbd *), bucket);
+
+	return bucket & GFS2_GL_HASH_MASK;
+}
+
+/**
+ * glock_free() - Perform a few checks and then release struct gfs2_glock
+ * @gl: The glock to release
+ *
+ * Also calls lock module to release its internal structure for this glock.
+ *
+ */
+
+static void glock_free(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct inode *aspace = gl->gl_aspace;
+
+ gfs2_lm_put_lock(sdp, gl->gl_lock);
+
+ /* Only some glock types have an address space attached. */
+ if (aspace)
+ gfs2_aspace_put(aspace);
+
+ kmem_cache_free(gfs2_glock_cachep, gl);
+}
+
+/**
+ * gfs2_glock_hold() - increment reference count on glock
+ * @gl: The glock to hold
+ *
+ * Only an atomic increment; no lock is taken here.
+ *
+ */
+
+void gfs2_glock_hold(struct gfs2_glock *gl)
+{
+ atomic_inc(&gl->gl_ref);
+}
+
+/**
+ * gfs2_glock_put() - Decrement reference count on glock
+ * @gl: The glock to put
+ *
+ * When the count reaches zero the glock is removed from its hash chain
+ * under the bucket write lock, sanity-checked and freed.
+ *
+ * Returns: 1 if the glock was freed, 0 otherwise
+ */
+
+int gfs2_glock_put(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+
+	write_lock(gl_lock_addr(gl->gl_hash));
+	if (!atomic_dec_and_test(&gl->gl_ref)) {
+		/* Other references remain. */
+		write_unlock(gl_lock_addr(gl->gl_hash));
+		return 0;
+	}
+
+	/* Last reference: unhash before dropping the bucket lock. */
+	hlist_del(&gl->gl_list);
+	write_unlock(gl_lock_addr(gl->gl_hash));
+
+	BUG_ON(spin_is_locked(&gl->gl_spin));
+	gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
+	gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
+	gfs2_assert(sdp, list_empty(&gl->gl_holders));
+	gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
+	gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
+	gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
+	glock_free(gl);
+
+	return 1;
+}
+
+/**
+ * queue_empty - check to see if a glock's queue is empty
+ * @gl: the glock
+ * @head: the head of the queue to check
+ *
+ * The check is made under gl_spin. The glmutex normally serializes work
+ * on a glock, but a process that already has a holder queued can add a
+ * second one for itself ("recursive" locking) without taking the
+ * glmutex, so the spin lock is what protects the list here.
+ *
+ * Returns: 1 if the queue is empty
+ */
+
+static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
+{
+	int is_empty;
+
+	spin_lock(&gl->gl_spin);
+	is_empty = list_empty(head);
+	spin_unlock(&gl->gl_spin);
+
+	return is_empty;
+}
+
+/**
+ * search_bucket() - Find struct gfs2_glock by lock name in one hash chain
+ * @hash: index of the hash chain to search
+ * @sdp: the superblock the glock must belong to
+ * @name: The lock name
+ *
+ * A matching glock is returned with its reference count incremented.
+ * No lock is taken here.
+ *
+ * Returns: NULL, or the struct gfs2_glock with the requested name
+ */
+
+static struct gfs2_glock *search_bucket(unsigned int hash,
+					const struct gfs2_sbd *sdp,
+					const struct lm_lockname *name)
+{
+	struct gfs2_glock *gl;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry(gl, pos, &gl_hash_table[hash].hb_list, gl_list) {
+		if (lm_name_equal(&gl->gl_name, name) && gl->gl_sbd == sdp) {
+			atomic_inc(&gl->gl_ref);
+			return gl;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * gfs2_glock_find() - Find glock by lock number
+ * @sdp: The GFS2 superblock
+ * @name: The lock name
+ *
+ * The chain is searched under the bucket read lock. A glock that is found
+ * is returned with its reference count incremented by search_bucket().
+ *
+ * Returns: NULL, or the struct gfs2_glock with the requested number
+ */
+
+static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
+ const struct lm_lockname *name)
+{
+ unsigned int hash = gl_hash(sdp, name);
+ struct gfs2_glock *gl;
+
+ read_lock(gl_lock_addr(hash));
+ gl = search_bucket(hash, sdp, name);
+ read_unlock(gl_lock_addr(hash));
+
+ return gl;
+}
+
+/**
+ * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
+ * @sdp: The GFS2 superblock
+ * @number: the lock number
+ * @glops: The glock_operations to use
+ * @create: If 0, don't create the glock if it doesn't exist
+ * @glp: the glock is returned here
+ *
+ * This does not lock a glock, just finds/creates structures for one.
+ *
+ * A glock that is found or created is returned with a reference held.
+ * If @create is 0 and no glock exists, 0 is returned and *@glp is NULL.
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
+ const struct gfs2_glock_operations *glops, int create,
+ struct gfs2_glock **glp)
+{
+ struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
+ struct gfs2_glock *gl, *tmp;
+ unsigned int hash = gl_hash(sdp, &name);
+ int error;
+
+ read_lock(gl_lock_addr(hash));
+ gl = search_bucket(hash, sdp, &name);
+ read_unlock(gl_lock_addr(hash));
+
+ /* Fast path: an existing glock (reference taken by search_bucket),
+    or the caller does not want one created. */
+ if (gl || !create) {
+ *glp = gl;
+ return 0;
+ }
+
+ gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
+ if (!gl)
+ return -ENOMEM;
+
+ /* NOTE(review): gl_spin and the list heads are not initialized here;
+    presumably the slab cache constructor does it - confirm. */
+ gl->gl_flags = 0;
+ gl->gl_name = name;
+ atomic_set(&gl->gl_ref, 1);
+ gl->gl_state = LM_ST_UNLOCKED;
+ gl->gl_hash = hash;
+ gl->gl_owner = NULL;
+ gl->gl_ip = 0;
+ gl->gl_ops = glops;
+ gl->gl_req_gh = NULL;
+ gl->gl_req_bh = NULL;
+ gl->gl_vn = 0;
+ gl->gl_stamp = jiffies;
+ gl->gl_object = NULL;
+ gl->gl_sbd = sdp;
+ gl->gl_aspace = NULL;
+ lops_init_le(&gl->gl_le, &gfs2_glock_lops);
+
+ /* If this glock protects actual on-disk data or metadata blocks,
+ create a VFS inode to manage the pages/buffers holding them. */
+ if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
+ gl->gl_aspace = gfs2_aspace_get(sdp);
+ if (!gl->gl_aspace) {
+ error = -ENOMEM;
+ goto fail;
+ }
+ }
+
+ error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
+ if (error)
+ goto fail_aspace;
+
+ /* Search again under the write lock: the same glock may have been
+    inserted while the bucket lock was not held. */
+ write_lock(gl_lock_addr(hash));
+ tmp = search_bucket(hash, sdp, &name);
+ if (tmp) {
+ /* Lost the race: discard ours and return the existing one. */
+ write_unlock(gl_lock_addr(hash));
+ glock_free(gl);
+ gl = tmp;
+ } else {
+ hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list);
+ write_unlock(gl_lock_addr(hash));
+ }
+
+ *glp = gl;
+
+ return 0;
+
+fail_aspace:
+ if (gl->gl_aspace)
+ gfs2_aspace_put(gl->gl_aspace);
+fail:
+ kmem_cache_free(gfs2_glock_cachep, gl);
+ return error;
+}
+
+/**
+ * gfs2_holder_init - initialize a struct gfs2_holder in the default way
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * Takes a reference on @gl; gfs2_holder_uninit() drops it.
+ *
+ */
+
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+ struct gfs2_holder *gh)
+{
+ INIT_LIST_HEAD(&gh->gh_list);
+ gh->gh_gl = gl;
+ /* Record the caller's return address and task in the holder. */
+ gh->gh_ip = (unsigned long)__builtin_return_address(0);
+ gh->gh_owner = current;
+ gh->gh_state = state;
+ gh->gh_flags = flags;
+ gh->gh_error = 0;
+ gh->gh_iflags = 0;
+ init_completion(&gh->gh_wait);
+
+ /* An exclusive request always gets GL_LOCAL_EXCL as well. */
+ if (gh->gh_state == LM_ST_EXCLUSIVE)
+ gh->gh_flags |= GL_LOCAL_EXCL;
+
+ gfs2_glock_hold(gl);
+}
+
+/**
+ * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * Don't mess with the glock.
+ *
+ */
+
+void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
+{
+ gh->gh_state = state;
+ gh->gh_flags = flags;
+ if (gh->gh_state == LM_ST_EXCLUSIVE)
+ gh->gh_flags |= GL_LOCAL_EXCL;
+
+ /* Clear every internal flag except HIF_ALLOCED. */
+ gh->gh_iflags &= 1 << HIF_ALLOCED;
+ gh->gh_ip = (unsigned long)__builtin_return_address(0);
+}
+
+/**
+ * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
+ * @gh: the holder structure
+ *
+ * The holder itself is not freed; its glock pointer and recorded caller
+ * address are cleared.
+ *
+ */
+
+void gfs2_holder_uninit(struct gfs2_holder *gh)
+{
+ gfs2_glock_put(gh->gh_gl);
+ gh->gh_gl = NULL;
+ gh->gh_ip = 0;
+}
+
+/**
+ * gfs2_holder_get - get a struct gfs2_holder structure
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gfp_flags: allocation flags passed to kmalloc()
+ *
+ * Figure out how big an impact this function has. Either:
+ * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
+ * 2) Leave it like it is
+ *
+ * Returns: the holder structure, NULL on ENOMEM
+ */
+
+static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
+ unsigned int state,
+ int flags, gfp_t gfp_flags)
+{
+ struct gfs2_holder *gh;
+
+ gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
+ if (!gh)
+ return NULL;
+
+ gfs2_holder_init(gl, state, flags, gh);
+ /* Mark the holder as heap-allocated. */
+ set_bit(HIF_ALLOCED, &gh->gh_iflags);
+ /* Overwrite the address recorded by gfs2_holder_init() with the
+    return address of this function. */
+ gh->gh_ip = (unsigned long)__builtin_return_address(0);
+ return gh;
+}
+
+/**
+ * gfs2_holder_put - get rid of a struct gfs2_holder structure
+ * @gh: the holder structure
+ *
+ * Drops the holder's glock reference and frees the holder with kfree().
+ *
+ */
+
+static void gfs2_holder_put(struct gfs2_holder *gh)
+{
+ gfs2_holder_uninit(gh);
+ kfree(gh);
+}
+
+/**
+ * rq_mutex - process a mutex request in the queue
+ * @gh: the glock holder
+ *
+ * Hands GLF_LOCK to the waiter: removes it from the queue, sets the bit
+ * and wakes the waiter.
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_mutex(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ list_del_init(&gh->gh_list);
+ /* gh->gh_error never examined. */
+ set_bit(GLF_LOCK, &gl->gl_flags);
+ complete(&gh->gh_wait);
+
+ /* GLF_LOCK is now set, so the queue is always blocked afterwards. */
+ return 1;
+}
+
+/**
+ * rq_promote - process a promote request in the queue
+ * @gh: the glock holder
+ *
+ * Acquire a new inter-node lock, or change a lock state to more restrictive.
+ *
+ * gl_spin is dropped and re-taken around the call into the glock
+ * operations.
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_promote(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ /* The current lock state does not satisfy this request. */
+ if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+ /* With no holders in the way, take GLF_LOCK and start the
+    state change; otherwise just report blocked. */
+ if (list_empty(&gl->gl_holders)) {
+ gl->gl_req_gh = gh;
+ set_bit(GLF_LOCK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+
+ /* Over the reclaim limit, requests without
+    LM_FLAG_PRIORITY first call gfs2_reclaim_glock()
+    twice. */
+ if (atomic_read(&sdp->sd_reclaim_count) >
+ gfs2_tune_get(sdp, gt_reclaim_limit) &&
+ !(gh->gh_flags & LM_FLAG_PRIORITY)) {
+ gfs2_reclaim_glock(sdp);
+ gfs2_reclaim_glock(sdp);
+ }
+
+ glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
+ spin_lock(&gl->gl_spin);
+ }
+ return 1;
+ }
+
+ /* The state is compatible. The first holder takes GLF_LOCK and is
+    flagged HIF_FIRST. Otherwise a GL_LOCAL_EXCL request, or a
+    GL_LOCAL_EXCL holder at the head of gl_holders, blocks the queue. */
+ if (list_empty(&gl->gl_holders)) {
+ set_bit(HIF_FIRST, &gh->gh_iflags);
+ set_bit(GLF_LOCK, &gl->gl_flags);
+ } else {
+ struct gfs2_holder *next_gh;
+ if (gh->gh_flags & GL_LOCAL_EXCL)
+ return 1;
+ next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
+ gh_list);
+ if (next_gh->gh_flags & GL_LOCAL_EXCL)
+ return 1;
+ }
+
+ /* Grant: move the request to the holders list and wake the waiter. */
+ list_move_tail(&gh->gh_list, &gl->gl_holders);
+ gh->gh_error = 0;
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+
+ complete(&gh->gh_wait);
+
+ return 0;
+}
+
+/**
+ * rq_demote - process a demote request in the queue
+ * @gh: the glock holder
+ *
+ * gl_spin is dropped and re-taken around completing the request or
+ * calling into the glock operations.
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_demote(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ /* Nothing can be demoted while the glock has holders. */
+ if (!list_empty(&gl->gl_holders))
+ return 1;
+
+ if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
+ /* Already in the requested state, or unlocked: the request
+    is complete. HIF_DEALLOC holders are freed here, others
+    have their waiter woken. */
+ list_del_init(&gh->gh_list);
+ gh->gh_error = 0;
+ spin_unlock(&gl->gl_spin);
+ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+ gfs2_holder_put(gh);
+ else
+ complete(&gh->gh_wait);
+ spin_lock(&gl->gl_spin);
+ } else {
+ /* Start the state change under GLF_LOCK. */
+ gl->gl_req_gh = gh;
+ set_bit(GLF_LOCK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+
+ /* Drop the lock when the target is unlocked or the current
+    state is not exclusive; otherwise change to the requested
+    state. */
+ if (gh->gh_state == LM_ST_UNLOCKED ||
+ gl->gl_state != LM_ST_EXCLUSIVE)
+ glops->go_drop_th(gl);
+ else
+ glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
+
+ spin_lock(&gl->gl_spin);
+ }
+
+ /* 0 is returned even when GLF_LOCK was set above; run_queue()
+    tests GLF_LOCK at the top of its loop and stops there. */
+ return 0;
+}
+
+/**
+ * rq_greedy - process a queued request to drop greedy status
+ * @gh: the glock holder
+ *
+ * Clears GLF_GREEDY and frees the struct greedy that embeds @gh.
+ * gl_spin is dropped and re-taken around the free.
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_greedy(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ list_del_init(&gh->gh_list);
+ /* gh->gh_error never examined. */
+ clear_bit(GLF_GREEDY, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+
+ /* gl was read before this point; gh_gl is cleared by the uninit. */
+ gfs2_holder_uninit(gh);
+ kfree(container_of(gh, struct greedy, gr_gh));
+
+ spin_lock(&gl->gl_spin);
+
+ return 0;
+}
+
+/**
+ * run_queue - process holder structures on a glock
+ * @gl: the glock
+ *
+ * Handles the head of the first non-empty queue, in this order:
+ * gl_waiters1 (mutex requests), gl_waiters2 (demote and greedy requests,
+ * skipped while GLF_SKIP_WAITERS2 is set), gl_waiters3 (promote requests).
+ * Stops when GLF_LOCK is set, when nothing is queued, or when a handler
+ * reports that the queue is blocked.
+ *
+ * The callers visible in this file invoke it with gl->gl_spin held.
+ */
+static void run_queue(struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+ /* Starts at 1, so an unexpected request type (assert_warn branches
+    below) on the first pass ends the loop. */
+ int blocked = 1;
+
+ for (;;) {
+ if (test_bit(GLF_LOCK, &gl->gl_flags))
+ break;
+
+ if (!list_empty(&gl->gl_waiters1)) {
+ gh = list_entry(gl->gl_waiters1.next,
+ struct gfs2_holder, gh_list);
+
+ if (test_bit(HIF_MUTEX, &gh->gh_iflags))
+ blocked = rq_mutex(gh);
+ else
+ gfs2_assert_warn(gl->gl_sbd, 0);
+
+ } else if (!list_empty(&gl->gl_waiters2) &&
+ !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
+ gh = list_entry(gl->gl_waiters2.next,
+ struct gfs2_holder, gh_list);
+
+ if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
+ blocked = rq_demote(gh);
+ else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
+ blocked = rq_greedy(gh);
+ else
+ gfs2_assert_warn(gl->gl_sbd, 0);
+
+ } else if (!list_empty(&gl->gl_waiters3)) {
+ gh = list_entry(gl->gl_waiters3.next,
+ struct gfs2_holder, gh_list);
+
+ if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
+ blocked = rq_promote(gh);
+ else
+ gfs2_assert_warn(gl->gl_sbd, 0);
+
+ } else
+ break;
+
+ if (blocked)
+ break;
+ }
+}
+
+/**
+ * gfs2_glmutex_lock - acquire a local lock on a glock
+ * @gl: the glock
+ *
+ * Gives caller exclusive access to manipulate a glock structure.
+ *
+ * The lock is the GLF_LOCK bit. If it is already set, an on-stack holder
+ * is queued on gl_waiters1 and the caller sleeps until rq_mutex() hands
+ * the bit over.
+ */
+
+static void gfs2_glmutex_lock(struct gfs2_glock *gl)
+{
+ struct gfs2_holder gh;
+
+ gfs2_holder_init(gl, 0, 0, &gh);
+ set_bit(HIF_MUTEX, &gh.gh_iflags);
+
+ spin_lock(&gl->gl_spin);
+ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+ list_add_tail(&gh.gh_list, &gl->gl_waiters1);
+ } else {
+ /* Uncontended: record the owner and complete our own holder
+    so the wait below returns immediately.
+    NOTE(review): gl_owner/gl_ip are only set on this path, not
+    when rq_mutex() grants the lock - confirm that is intended. */
+ gl->gl_owner = current;
+ gl->gl_ip = (unsigned long)__builtin_return_address(0);
+ complete(&gh.gh_wait);
+ }
+ spin_unlock(&gl->gl_spin);
+
+ wait_for_completion(&gh.gh_wait);
+ gfs2_holder_uninit(&gh);
+}
+
+/**
+ * gfs2_glmutex_trylock - try to acquire a local lock on a glock
+ * @gl: the glock
+ *
+ * Never sleeps waiting for the lock: if GLF_LOCK is already set the
+ * attempt simply fails. On success the owner task and the caller's
+ * return address are recorded in the glock.
+ *
+ * Returns: 1 if the glock is acquired
+ */
+
+static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
+{
+	int got_it;
+
+	spin_lock(&gl->gl_spin);
+	got_it = !test_and_set_bit(GLF_LOCK, &gl->gl_flags);
+	if (got_it) {
+		gl->gl_owner = current;
+		gl->gl_ip = (unsigned long)__builtin_return_address(0);
+	}
+	spin_unlock(&gl->gl_spin);
+
+	return got_it;
+}
+
+/**
+ * gfs2_glmutex_unlock - release a local lock on a glock
+ * @gl: the glock
+ *
+ * Clears GLF_LOCK and the recorded owner, then runs the queue so that
+ * waiting requests can proceed.
+ *
+ */
+
+static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
+{
+ spin_lock(&gl->gl_spin);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ gl->gl_owner = NULL;
+ gl->gl_ip = 0;
+ run_queue(gl);
+ /* The request handlers drop and re-take gl_spin; check that it is
+    held again on return. */
+ BUG_ON(!spin_is_locked(&gl->gl_spin));
+ spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * handle_callback - add a demote request to a lock's queue
+ * @gl: the glock
+ * @state: the state the caller wants us to change to
+ *
+ * Note: This may fail silently if we are out of memory.
+ */
+
+static void handle_callback(struct gfs2_glock *gl, unsigned int state)
+{
+ struct gfs2_holder *gh, *new_gh = NULL;
+
+restart:
+ spin_lock(&gl->gl_spin);
+
+ /* Reuse a queued demote request that is not the one currently being
+    processed; if it asks for a different state, turn it into a
+    request for LM_ST_UNLOCKED. */
+ list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
+ if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
+ gl->gl_req_gh != gh) {
+ if (gh->gh_state != state)
+ gh->gh_state = LM_ST_UNLOCKED;
+ goto out;
+ }
+ }
+
+ if (new_gh) {
+ list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
+ new_gh = NULL;
+ } else {
+ /* Allocate with the spinlock dropped, then rescan the queue
+    from the top, since it was unprotected in the meantime. */
+ spin_unlock(&gl->gl_spin);
+
+ new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL);
+ if (!new_gh)
+ return;
+ set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
+ set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
+
+ goto restart;
+ }
+
+out:
+ spin_unlock(&gl->gl_spin);
+
+ /* Still set only when an existing request was found on the rescan;
+    the spare holder is then not needed. */
+ if (new_gh)
+ gfs2_holder_put(new_gh);
+}
+
+/**
+ * gfs2_glock_inode_squish - demote an inode's glock to unlocked and wait
+ * @inode: the inode whose glock is to be demoted
+ *
+ * Asserts that the glock has no holders, queues an on-stack demote
+ * request for LM_ST_UNLOCKED on gl_waiters2, runs the queue and waits
+ * for the request to complete.
+ */
+void gfs2_glock_inode_squish(struct inode *inode)
+{
+ struct gfs2_holder gh;
+ struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
+ gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
+ set_bit(HIF_DEMOTE, &gh.gh_iflags);
+ spin_lock(&gl->gl_spin);
+ gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders));
+ list_add_tail(&gh.gh_list, &gl->gl_waiters2);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+ /* HIF_DEALLOC is not set, so the demote path wakes us rather than
+    freeing the holder. */
+ wait_for_completion(&gh.gh_wait);
+ gfs2_holder_uninit(&gh);
+}
+
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state: the new state
+ *
+ * A reference is taken when the glock leaves LM_ST_UNLOCKED and dropped
+ * when it returns to it; other transitions leave the count alone.
+ *
+ */
+
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+	const int was_held = gl->gl_state != LM_ST_UNLOCKED;
+	const int now_held = new_state != LM_ST_UNLOCKED;
+
+	if (now_held && !was_held)
+		gfs2_glock_hold(gl);
+	else if (was_held && !now_held)
+		gfs2_glock_put(gl);
+
+	gl->gl_state = new_state;
+}
+
+/**
+ * xmote_bh - Called after the lock module is done acquiring a lock
+ * @gl: The glock in question
+ * @ret: the int returned from the lock module
+ *
+ * Records the new state (ret & LM_OUT_ST_MASK), invalidates cached data
+ * where required, and then resolves the pending request gl->gl_req_gh (if
+ * there is one) according to the outcome.
+ */
+
+static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh = gl->gl_req_gh;
+ int prev_state = gl->gl_state;
+ /* Stays 1 unless the request becomes a holder; see the end of the function */
+ int op_done = 1;
+
+ gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+ gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+ gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
+
+ state_change(gl, ret & LM_OUT_ST_MASK);
+
+ if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
+ if (glops->go_inval)
+ glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+ } else if (gl->gl_state == LM_ST_DEFERRED) {
+ /* We might not want to do this here.
+ Look at moving to the inode glops. */
+ if (glops->go_inval)
+ glops->go_inval(gl, DIO_DATA);
+ }
+
+ /* Deal with each possible exit condition */
+
+ if (!gh)
+ gl->gl_stamp = jiffies;
+ else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ gh->gh_error = -EIO;
+ spin_unlock(&gl->gl_spin);
+ } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
+ /* Demote request: success if we reached the wanted state or unlocked */
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ if (gl->gl_state == gh->gh_state ||
+ gl->gl_state == LM_ST_UNLOCKED) {
+ gh->gh_error = 0;
+ } else {
+ if (gfs2_assert_warn(sdp, gh->gh_flags &
+ (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
+ fs_warn(sdp, "ret = 0x%.8X\n", ret);
+ gh->gh_error = GLR_TRYFAILED;
+ }
+ spin_unlock(&gl->gl_spin);
+
+ if (ret & LM_OUT_CANCELED)
+ handle_callback(gl, LM_ST_UNLOCKED);
+
+ } else if (ret & LM_OUT_CANCELED) {
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ gh->gh_error = GLR_CANCELED;
+ spin_unlock(&gl->gl_spin);
+
+ } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+ /* The request is satisfied: it becomes the first holder of the glock */
+ spin_lock(&gl->gl_spin);
+ list_move_tail(&gh->gh_list, &gl->gl_holders);
+ gh->gh_error = 0;
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ spin_unlock(&gl->gl_spin);
+
+ set_bit(HIF_FIRST, &gh->gh_iflags);
+
+ /* GLF_LOCK stays set; glock_wait_internal() clears it for HIF_FIRST holders */
+ op_done = 0;
+
+ } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ gh->gh_error = GLR_TRYFAILED;
+ spin_unlock(&gl->gl_spin);
+
+ } else {
+ if (gfs2_assert_withdraw(sdp, 0) == -1)
+ fs_err(sdp, "ret = 0x%.8X\n", ret);
+ }
+
+ if (glops->go_xmote_bh)
+ glops->go_xmote_bh(gl);
+
+ if (op_done) {
+ spin_lock(&gl->gl_spin);
+ gl->gl_req_gh = NULL;
+ gl->gl_req_bh = NULL;
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+ }
+
+ /* NOTE(review): presumably pairs with the gfs2_glock_hold() in gfs2_glock_xmote_th() */
+ gfs2_glock_put(gl);
+
+ if (gh) {
+ /* HIF_DEALLOC holders are released here; any other holder has its waiter woken */
+ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+ gfs2_holder_put(gh);
+ else
+ complete(&gh->gh_wait);
+ }
+}
+
+/**
+ * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
+ * @gl: The glock in question
+ * @state: the requested state
+ * @flags: modifier flags to the lock call
+ *
+ * Only the LM_FLAG_* bits of @flags are passed on to the lock module.
+ * Installs xmote_bh() as gl->gl_req_bh and calls it directly when the lock
+ * module answers synchronously.
+ */
+
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
+ LM_FLAG_NOEXP | LM_FLAG_ANY |
+ LM_FLAG_PRIORITY);
+ unsigned int lck_ret;
+
+ gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+ gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+ gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
+ gfs2_assert_warn(sdp, state != gl->gl_state);
+
+ /* Leaving the exclusive state: let the glock type sync first */
+ if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
+ glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
+
+ /* Reference held across the request; xmote_bh() ends with gfs2_glock_put() */
+ gfs2_glock_hold(gl);
+ gl->gl_req_bh = xmote_bh;
+
+ lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
+
+ /* NOTE(review): on LM_OUT_ERROR we return without running the bottom half */
+ if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
+ return;
+
+ if (lck_ret & LM_OUT_ASYNC)
+ gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
+ else
+ xmote_bh(gl, lck_ret);
+}
+
+/**
+ * drop_bh - Called after a lock module unlock completes
+ * @gl: the glock
+ * @ret: the return status
+ *
+ * Marks the glock LM_ST_UNLOCKED, invalidates cached data, releases
+ * GLF_LOCK and reruns the queue.  It then drops a glock reference and, if
+ * there was a requesting holder, either releases it (HIF_DEALLOC) or
+ * completes its gh_wait.
+ */
+
+static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh = gl->gl_req_gh;
+
+ clear_bit(GLF_PREFETCH, &gl->gl_flags);
+
+ gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+ gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+ gfs2_assert_warn(sdp, !ret);
+
+ state_change(gl, LM_ST_UNLOCKED);
+
+ if (glops->go_inval)
+ glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+
+ if (gh) {
+ /* The requesting holder is finished: take it off its queue with success */
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ gh->gh_error = 0;
+ spin_unlock(&gl->gl_spin);
+ }
+
+ if (glops->go_drop_bh)
+ glops->go_drop_bh(gl);
+
+ spin_lock(&gl->gl_spin);
+ gl->gl_req_gh = NULL;
+ gl->gl_req_bh = NULL;
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+
+ /* NOTE(review): presumably pairs with the gfs2_glock_hold() in gfs2_glock_drop_th() */
+ gfs2_glock_put(gl);
+
+ if (gh) {
+ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+ gfs2_holder_put(gh);
+ else
+ complete(&gh->gh_wait);
+ }
+}
+
+/**
+ * gfs2_glock_drop_th - call into the lock module to unlock a lock
+ * @gl: the glock
+ *
+ * Installs drop_bh() as gl->gl_req_bh and calls it directly when the lock
+ * module returns 0 (synchronous completion).
+ */
+
+void gfs2_glock_drop_th(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ unsigned int ret;
+
+ gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+ gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+ gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
+
+ /* Leaving the exclusive state: let the glock type sync first */
+ if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
+ glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
+
+ /* Reference held across the request; drop_bh() ends with gfs2_glock_put() */
+ gfs2_glock_hold(gl);
+ gl->gl_req_bh = drop_bh;
+
+ ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
+
+ /* NOTE(review): on LM_OUT_ERROR we return without running the bottom half */
+ if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
+ return;
+
+ if (!ret)
+ drop_bh(gl, ret);
+ else
+ gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
+}
+
+/**
+ * do_cancels - cancel requests for locks stuck waiting on an expire flag
+ * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
+ *
+ * Polls every 100ms until @gh is the active request, has become a holder
+ * or has left its queue.  On each pass an in-flight lock module request is
+ * cancelled, unless it belongs to a GL_NOCANCEL holder.
+ */
+
+static void do_cancels(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	int cancel;
+
+	spin_lock(&gl->gl_spin);
+
+	for (;;) {
+		if (gl->gl_req_gh == gh ||
+		    test_bit(HIF_HOLDER, &gh->gh_iflags) ||
+		    list_empty(&gh->gh_list))
+			break;
+
+		/* Decide under gl_spin; act on the decision with it dropped. */
+		cancel = gl->gl_req_bh &&
+			 !(gl->gl_req_gh &&
+			   (gl->gl_req_gh->gh_flags & GL_NOCANCEL));
+
+		spin_unlock(&gl->gl_spin);
+		if (cancel)
+			gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
+		msleep(100);
+		spin_lock(&gl->gl_spin);
+	}
+
+	spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * glock_wait_internal - wait on a glock acquisition
+ * @gh: the glock holder
+ *
+ * Returns: 0 on success, -EIO if the holder was aborted, otherwise the
+ * value left in gh->gh_error (e.g. GLR_TRYFAILED, GLR_CANCELED or the
+ * result of the glock type's go_lock operation)
+ */
+
+static int glock_wait_internal(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ if (test_bit(HIF_ABORTED, &gh->gh_iflags))
+ return -EIO;
+
+ if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+ spin_lock(&gl->gl_spin);
+ /* A "try" request that is still only queued fails at once instead of waiting */
+ if (gl->gl_req_gh != gh &&
+ !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
+ !list_empty(&gh->gh_list)) {
+ list_del_init(&gh->gh_list);
+ gh->gh_error = GLR_TRYFAILED;
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+ return gh->gh_error;
+ }
+ spin_unlock(&gl->gl_spin);
+ }
+
+ if (gh->gh_flags & LM_FLAG_PRIORITY)
+ do_cancels(gh);
+
+ wait_for_completion(&gh->gh_wait);
+
+ if (gh->gh_error)
+ return gh->gh_error;
+
+ gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
+ gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
+ gh->gh_flags));
+
+ /* HIF_FIRST: this holder was promoted with GLF_LOCK still set (see xmote_bh) */
+ if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
+ gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+
+ if (glops->go_lock) {
+ gh->gh_error = glops->go_lock(gh);
+ if (gh->gh_error) {
+ /* go_lock failed: the holder does not keep the glock */
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ spin_unlock(&gl->gl_spin);
+ }
+ }
+
+ /* Finish the operation: release GLF_LOCK and let other waiters run */
+ spin_lock(&gl->gl_spin);
+ gl->gl_req_gh = NULL;
+ gl->gl_req_bh = NULL;
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+ }
+
+ return gh->gh_error;
+}
+
+/* Return the first holder on @head whose gh_owner is @owner, or NULL if none. */
+static inline struct gfs2_holder *
+find_holder_by_owner(struct list_head *head, struct task_struct *owner)
+{
+	struct gfs2_holder *pos;
+	struct gfs2_holder *match = NULL;
+
+	list_for_each_entry(pos, head, gh_list) {
+		if (pos->gh_owner != owner)
+			continue;
+		match = pos;
+		break;
+	}
+
+	return match;
+}
+
+/**
+ * add_to_queue - Add a holder to the wait queue (but look for recursion)
+ * @gh: the holder structure to add
+ *
+ * BUG()s if @gh has no owner, or if the same owner already has a holder on
+ * gl_holders or gl_waiters3 of this glock.  LM_FLAG_PRIORITY holders go to
+ * the front of gl_waiters3, all others to the back.
+ */
+
+static void add_to_queue(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_holder *existing;
+
+ BUG_ON(!gh->gh_owner);
+
+ /* Same task already holds this glock: report both call sites, then BUG */
+ existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
+ if (existing) {
+ print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
+ printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid);
+ printk(KERN_INFO "lock type : %d lock state : %d\n",
+ existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
+ print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+ printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid);
+ printk(KERN_INFO "lock type : %d lock state : %d\n",
+ gl->gl_name.ln_type, gl->gl_state);
+ BUG();
+ }
+
+ /* Same task is already waiting for this glock */
+ existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
+ if (existing) {
+ print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
+ print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+ BUG();
+ }
+
+ if (gh->gh_flags & LM_FLAG_PRIORITY)
+ list_add(&gh->gh_list, &gl->gl_waiters3);
+ else
+ list_add_tail(&gh->gh_list, &gl->gl_waiters3);
+}
+
+/**
+ * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
+ * @gh: the holder structure
+ *
+ * if (gh->gh_flags & GL_ASYNC), the request is only queued and the result
+ * is not waited for; the only error returned in that case is -EIO when the
+ * filesystem has been shut down.
+ *
+ * Returns: 0, GLR_TRYFAILED, or errno on failure
+ */
+
+int gfs2_glock_nq(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ int error = 0;
+
+restart:
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+ /* glock_wait_internal() returns -EIO for holders marked HIF_ABORTED */
+ set_bit(HIF_ABORTED, &gh->gh_iflags);
+ return -EIO;
+ }
+
+ set_bit(HIF_PROMOTE, &gh->gh_iflags);
+
+ spin_lock(&gl->gl_spin);
+ add_to_queue(gh);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+
+ if (!(gh->gh_flags & GL_ASYNC)) {
+ error = glock_wait_internal(gh);
+ /* Cancelled requests are retried from the top after a short back-off */
+ if (error == GLR_CANCELED) {
+ msleep(100);
+ goto restart;
+ }
+ }
+
+ clear_bit(GLF_PREFETCH, &gl->gl_flags);
+
+ if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
+ dump_glock(gl);
+
+ return error;
+}
+
+/**
+ * gfs2_glock_poll - poll to see if an async request has been completed
+ * @gh: the holder
+ *
+ * A request that ended with GLR_CANCELED is re-queued with gfs2_glock_nq()
+ * after a 100ms sleep; in that case the result is 1 only if the re-queue
+ * itself returned non-zero.
+ *
+ * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
+ */
+
+int gfs2_glock_poll(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ int ready = 0;
+
+ spin_lock(&gl->gl_spin);
+
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ ready = 1;
+ else if (list_empty(&gh->gh_list)) {
+ /* Off every queue without being a holder: the request ended without success */
+ if (gh->gh_error == GLR_CANCELED) {
+ spin_unlock(&gl->gl_spin);
+ msleep(100);
+ if (gfs2_glock_nq(gh))
+ return 1;
+ return 0;
+ } else
+ ready = 1;
+ }
+
+ spin_unlock(&gl->gl_spin);
+
+ return ready;
+}
+
+/**
+ * gfs2_glock_wait - wait for a previously queued (GL_ASYNC) lock request
+ * @gh: the holder structure
+ *
+ * If the request turns out to have been cancelled, it is re-issued
+ * synchronously after a 100ms back-off.
+ *
+ * Returns: 0, GLR_TRYFAILED, or errno on failure
+ */
+
+int gfs2_glock_wait(struct gfs2_holder *gh)
+{
+	int error = glock_wait_internal(gh);
+
+	if (error != GLR_CANCELED)
+		return error;
+
+	msleep(100);
+	gh->gh_flags &= ~GL_ASYNC;
+	return gfs2_glock_nq(gh);
+}
+
+/**
+ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+ * @gh: the glock holder
+ *
+ * With GL_NOCACHE set, a demote to LM_ST_UNLOCKED is requested first.  When
+ * the last holder goes away the glock type's go_unlock operation is called
+ * and gl_stamp is refreshed.
+ */
+
+void gfs2_glock_dq(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ if (gh->gh_flags & GL_NOCACHE)
+ handle_callback(gl, LM_ST_UNLOCKED);
+
+ /* NOTE(review): presumably takes GLF_LOCK, which is cleared by hand below - confirm */
+ gfs2_glmutex_lock(gl);
+
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+
+ if (list_empty(&gl->gl_holders)) {
+ /* go_unlock is called without gl_spin held */
+ spin_unlock(&gl->gl_spin);
+
+ if (glops->go_unlock)
+ glops->go_unlock(gh);
+
+ gl->gl_stamp = jiffies;
+
+ spin_lock(&gl->gl_spin);
+ }
+
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * gfs2_glock_prefetch - Try to prefetch a glock
+ * @gl: the glock
+ * @state: the state to prefetch in
+ * @flags: flags passed to go_xmote_th()
+ *
+ * Does nothing if the glock is busy (GLF_LOCK set, or any holder or waiter
+ * queued) or if its current state already satisfies @state.
+ */
+
+static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
+ int flags)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ spin_lock(&gl->gl_spin);
+
+ if (test_bit(GLF_LOCK, &gl->gl_flags) || !list_empty(&gl->gl_holders) ||
+ !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) ||
+ !list_empty(&gl->gl_waiters3) ||
+ relaxed_state_ok(gl->gl_state, state, flags)) {
+ spin_unlock(&gl->gl_spin);
+ return;
+ }
+
+ /* Claim the glock, then start the state change with no requesting holder */
+ set_bit(GLF_PREFETCH, &gl->gl_flags);
+ set_bit(GLF_LOCK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+
+ glops->go_xmote_th(gl, state, flags);
+}
+
+/*
+ * greedy_work - delayed work scheduled by gfs2_glock_be_greedy()
+ * @data: the struct greedy allocated by gfs2_glock_be_greedy()
+ *
+ * Clears GLF_SKIP_WAITERS2 and calls the glock type's go_greedy operation.
+ * If nothing is queued on gl_waiters2 the greedy state is torn down and
+ * @data is freed; otherwise the embedded holder is queued on gl_waiters2.
+ */
+static void greedy_work(void *data)
+{
+ struct greedy *gr = data;
+ struct gfs2_holder *gh = &gr->gr_gh;
+ struct gfs2_glock *gl = gh->gh_gl;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
+
+ if (glops->go_greedy)
+ glops->go_greedy(gl);
+
+ spin_lock(&gl->gl_spin);
+
+ if (list_empty(&gl->gl_waiters2)) {
+ clear_bit(GLF_GREEDY, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+ gfs2_holder_uninit(gh);
+ kfree(gr);
+ } else {
+ /* Extra reference keeps gl valid across run_queue() and the unlock */
+ gfs2_glock_hold(gl);
+ list_add_tail(&gh->gh_list, &gl->gl_waiters2);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_put(gl);
+ }
+}
+
+/**
+ * gfs2_glock_be_greedy - schedule greedy_work() for a glock after a delay
+ * @gl: the glock
+ * @time: the delay handed to schedule_delayed_work(); 0 disables the request
+ *
+ * Nothing is scheduled when @time is 0, when the ar_localcaching mount
+ * argument is set, when GLF_GREEDY is already set on the glock, or when the
+ * allocation fails.
+ *
+ * Returns: 0 if go_greedy will be called, 1 otherwise
+ */
+
+int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
+{
+ struct greedy *gr;
+ struct gfs2_holder *gh;
+
+ if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
+ test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
+ return 1;
+
+ gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
+ if (!gr) {
+ /* Undo the test_and_set_bit() above so a later call can try again */
+ clear_bit(GLF_GREEDY, &gl->gl_flags);
+ return 1;
+ }
+ gh = &gr->gr_gh;
+
+ gfs2_holder_init(gl, 0, 0, gh);
+ set_bit(HIF_GREEDY, &gh->gh_iflags);
+ INIT_WORK(&gr->gr_work, greedy_work, gr);
+
+ /* greedy_work() clears GLF_SKIP_WAITERS2 again when it runs */
+ set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
+ schedule_delayed_work(&gr->gr_work, time);
+
+ return 0;
+}
+
+/**
+ * gfs2_glock_dq_uninit - dequeue a holder from a glock and uninitialize it
+ * @gh: the holder structure
+ *
+ */
+
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
+{
+ gfs2_glock_dq(gh);
+ gfs2_holder_uninit(gh);
+}
+
+/**
+ * gfs2_glock_nq_num - acquire a glock based on lock number
+ * @sdp: the filesystem
+ * @number: the lock number
+ * @glops: the glock operations for the type of glock
+ * @state: the state to acquire the glock in
+ * @flags: modifier flags for the acquisition
+ * @gh: the struct gfs2_holder
+ *
+ * Looks the glock up (creating it if necessary), enqueues @gh on it and
+ * then drops the lookup reference again.
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
+		      const struct gfs2_glock_operations *glops,
+		      unsigned int state, int flags, struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl;
+	int error;
+
+	error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
+	if (error)
+		return error;
+
+	error = gfs2_glock_nq_init(gl, state, flags, gh);
+	gfs2_glock_put(gl);
+
+	return error;
+}
+
+/**
+ * glock_compare - Compare two struct gfs2_glock structures for sorting
+ * @arg_a: the first structure (pointer to a struct gfs2_holder pointer)
+ * @arg_b: the second structure (pointer to a struct gfs2_holder pointer)
+ *
+ * Orders by lock number first.  For equal lock numbers an LM_ST_EXCLUSIVE
+ * request sorts before an LM_ST_SHARED one, and after that a GL_LOCAL_EXCL
+ * request sorts before one without the flag.
+ *
+ * The mirrored cases return -1 so that compare(a, b) == -compare(b, a), as
+ * sort() requires of its comparison callback.
+ *
+ * Returns: <0, 0 or >0 as @arg_a sorts before, equal to or after @arg_b
+ */
+
+static int glock_compare(const void *arg_a, const void *arg_b)
+{
+	const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
+	const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
+	const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
+	const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
+	int a_local = !!(gh_a->gh_flags & GL_LOCAL_EXCL);
+	int b_local = !!(gh_b->gh_flags & GL_LOCAL_EXCL);
+
+	if (a->ln_number > b->ln_number)
+		return 1;
+	if (a->ln_number < b->ln_number)
+		return -1;
+
+	/* Same lock number: exclusive requests go ahead of shared ones */
+	if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
+		return 1;
+	if (gh_a->gh_state == LM_ST_EXCLUSIVE && gh_b->gh_state == LM_ST_SHARED)
+		return -1;
+
+	/* Then locally-exclusive requests go ahead of the others */
+	if (!a_local && b_local)
+		return 1;
+	if (a_local && !b_local)
+		return -1;
+
+	return 0;
+}
+
+/**
+ * nq_m_sync - synchronously acquire more than one glock in deadlock free order
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ * @p: caller-provided scratch array with room for @num_gh holder pointers
+ *
+ * Returns: 0 on success (all glocks acquired),
+ * errno on failure (no glocks acquired)
+ */
+
+static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
+ struct gfs2_holder **p)
+{
+ unsigned int x;
+ int error = 0;
+
+ for (x = 0; x < num_gh; x++)
+ p[x] = &ghs[x];
+
+ /* Sort the pointers, not @ghs itself, so the caller's array order is kept */
+ sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
+
+ for (x = 0; x < num_gh; x++) {
+ /* Force a blocking, synchronous request */
+ p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+
+ error = gfs2_glock_nq(p[x]);
+ if (error) {
+ /* Release everything acquired so far, most recent first */
+ while (x--)
+ gfs2_glock_dq(p[x]);
+ break;
+ }
+ }
+
+ return error;
+}
+
+/**
+ * gfs2_glock_nq_m - acquire multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * First tries to take every glock with LM_FLAG_TRY | GL_ASYNC.  If any of
+ * them fails only with GLR_TRYFAILED or GLR_CANCELED, whatever was acquired
+ * is released and nq_m_sync() retries all of them one at a time in sorted
+ * order.
+ *
+ * Figure out how big an impact this function has. Either:
+ * 1) Replace this code with code that calls gfs2_glock_prefetch()
+ * 2) Forget async stuff and just call nq_m_sync()
+ * 3) Leave it like it is
+ *
+ * Returns: 0 on success (all glocks acquired),
+ * errno on failure (no glocks acquired)
+ */
+
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+ int *e;
+ unsigned int x;
+ int borked = 0, serious = 0;
+ int error = 0;
+
+ if (!num_gh)
+ return 0;
+
+ if (num_gh == 1) {
+ ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+ return gfs2_glock_nq(ghs);
+ }
+
+ /*
+ * e[] first holds one int result per holder.  It is sized for pointers
+ * because the same buffer is later reused as the holder-pointer scratch
+ * array handed to nq_m_sync().
+ */
+ e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
+ if (!e)
+ return -ENOMEM;
+
+ for (x = 0; x < num_gh; x++) {
+ ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
+ error = gfs2_glock_nq(&ghs[x]);
+ if (error) {
+ borked = 1;
+ serious = error;
+ /* Only holders [0, x) were queued; limit the later passes to them */
+ num_gh = x;
+ break;
+ }
+ }
+
+ for (x = 0; x < num_gh; x++) {
+ error = e[x] = glock_wait_internal(&ghs[x]);
+ if (error) {
+ borked = 1;
+ /* Try-failures and cancellations are retried below; anything else is final */
+ if (error != GLR_TRYFAILED && error != GLR_CANCELED)
+ serious = error;
+ }
+ }
+
+ if (!borked) {
+ kfree(e);
+ return 0;
+ }
+
+ /* At least one request failed: give back the glocks that were acquired */
+ for (x = 0; x < num_gh; x++)
+ if (!e[x])
+ gfs2_glock_dq(&ghs[x]);
+
+ if (serious)
+ error = serious;
+ else {
+ for (x = 0; x < num_gh; x++)
+ gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
+ &ghs[x]);
+ error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
+ }
+
+ kfree(e);
+
+ return error;
+}
+
+/**
+ * gfs2_glock_dq_m - release multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * The holders are dequeued in array order.
+ */
+
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+	unsigned int idx = 0;
+
+	while (idx < num_gh) {
+		gfs2_glock_dq(&ghs[idx]);
+		idx++;
+	}
+}
+
+/**
+ * gfs2_glock_dq_uninit_m - release and uninitialize multiple glock holders
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * The holders are processed in array order.
+ */
+
+void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+	unsigned int idx = 0;
+
+	while (idx < num_gh) {
+		gfs2_glock_dq_uninit(&ghs[idx]);
+		idx++;
+	}
+}
+
+/**
+ * gfs2_glock_prefetch_num - prefetch a glock based on lock number
+ * @sdp: the filesystem
+ * @number: the lock number
+ * @glops: the glock operations for the type of glock
+ * @state: the state to acquire the glock in
+ * @flags: modifier flags for the acquisition
+ *
+ * Best effort: nothing is done once sd_reclaim_count has reached the
+ * gt_reclaim_limit tunable, or when the glock cannot be looked up.
+ */
+
+void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
+			     const struct gfs2_glock_operations *glops,
+			     unsigned int state, int flags)
+{
+	struct gfs2_glock *gl;
+
+	if (atomic_read(&sdp->sd_reclaim_count) >=
+	    gfs2_tune_get(sdp, gt_reclaim_limit))
+		return;
+
+	if (gfs2_glock_get(sdp, number, glops, CREATE, &gl))
+		return;
+
+	gfs2_glock_prefetch(gl, state, flags);
+	gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_lvb_hold - attach a LVB to a glock
+ * @gl: The glock in question
+ *
+ * The first hold asks the lock module for the LVB and takes a reference on
+ * the glock; later holds only increment gl_lvb_count.
+ *
+ * Returns: 0 on success, or the error from gfs2_lm_hold_lvb()
+ */
+
+int gfs2_lvb_hold(struct gfs2_glock *gl)
+{
+ int error;
+
+ gfs2_glmutex_lock(gl);
+
+ if (!atomic_read(&gl->gl_lvb_count)) {
+ error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
+ if (error) {
+ gfs2_glmutex_unlock(gl);
+ return error;
+ }
+ /* Dropped again by the final gfs2_lvb_unhold() */
+ gfs2_glock_hold(gl);
+ }
+ atomic_inc(&gl->gl_lvb_count);
+
+ gfs2_glmutex_unlock(gl);
+
+ return 0;
+}
+
+/**
+ * gfs2_lvb_unhold - detach a LVB from a glock
+ * @gl: The glock in question
+ *
+ * When the last hold goes away the LVB is handed back to the lock module
+ * and the glock reference taken by the first gfs2_lvb_hold() is dropped.
+ */
+
+void gfs2_lvb_unhold(struct gfs2_glock *gl)
+{
+ /* Temporary reference so gl stays valid until after gfs2_glmutex_unlock() */
+ gfs2_glock_hold(gl);
+ gfs2_glmutex_lock(gl);
+
+ gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
+ if (atomic_dec_and_test(&gl->gl_lvb_count)) {
+ gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
+ gl->gl_lvb = NULL;
+ gfs2_glock_put(gl);
+ }
+
+ gfs2_glmutex_unlock(gl);
+ gfs2_glock_put(gl);
+}
+
+/*
+ * blocking_cb - handle a lock module request to give up or demote a lock
+ * @sdp: the filesystem
+ * @name: the name of the lock in question
+ * @state: the state passed on to go_callback and handle_callback()
+ *
+ * Does nothing if no glock with that name is found.
+ */
+static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ unsigned int state)
+{
+ struct gfs2_glock *gl;
+
+ gl = gfs2_glock_find(sdp, name);
+ if (!gl)
+ return;
+
+ if (gl->gl_ops->go_callback)
+ gl->gl_ops->go_callback(gl, state);
+ handle_callback(gl, state);
+
+ spin_lock(&gl->gl_spin);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+
+ /* NOTE(review): presumably drops the reference returned by gfs2_glock_find() */
+ gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_glock_cb - Callback used by locking module
+ * @cb_data: opaque callback cookie; used here as the struct gfs2_sbd pointer
+ * @type: Type of callback
+ * @data: Type dependent data pointer
+ *
+ * Called by the locking module when it wants to tell us something.
+ * Either we need to drop a lock, one of our ASYNC requests completed, or
+ * a journal from another client needs to be recovered.
+ */
+
+void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
+{
+ struct gfs2_sbd *sdp = cb_data;
+
+ switch (type) {
+ /* LM_CB_NEED_*: @data is the struct lm_lockname of the contended lock */
+ case LM_CB_NEED_E:
+ blocking_cb(sdp, data, LM_ST_UNLOCKED);
+ return;
+
+ case LM_CB_NEED_D:
+ blocking_cb(sdp, data, LM_ST_DEFERRED);
+ return;
+
+ case LM_CB_NEED_S:
+ blocking_cb(sdp, data, LM_ST_SHARED);
+ return;
+
+ /* An asynchronous request finished: run the bottom half stored in gl_req_bh */
+ case LM_CB_ASYNC: {
+ struct lm_async_cb *async = data;
+ struct gfs2_glock *gl;
+
+ gl = gfs2_glock_find(sdp, &async->lc_name);
+ if (gfs2_assert_warn(sdp, gl))
+ return;
+ if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
+ gl->gl_req_bh(gl, async->lc_ret);
+ gfs2_glock_put(gl);
+ return;
+ }
+
+ /* @data points to an unsigned int that is passed to gfs2_jdesc_make_dirty() */
+ case LM_CB_NEED_RECOVERY:
+ gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
+ if (sdp->sd_recoverd_process)
+ wake_up_process(sdp->sd_recoverd_process);
+ return;
+
+ case LM_CB_DROPLOCKS:
+ gfs2_gl_hash_clear(sdp, NO_WAIT);
+ gfs2_quota_scan(sdp);
+ return;
+
+ default:
+ gfs2_assert_warn(sdp, 0);
+ return;
+ }
+}
+
+/**
+ * demote_ok - Check to see if it's ok to unlock a glock
+ * @gl: the glock
+ *
+ * GLF_STICKY glocks are never demoted.  A prefetched glock may be demoted
+ * once gt_prefetch_secs have passed since gl_stamp.  Otherwise the glock
+ * type decides through go_demote_ok, and the default answer is yes.
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int demote_ok(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+	if (test_bit(GLF_STICKY, &gl->gl_flags))
+		return 0;
+
+	if (test_bit(GLF_PREFETCH, &gl->gl_flags))
+		return time_after_eq(jiffies, gl->gl_stamp +
+				     gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
+
+	if (glops->go_demote_ok)
+		return glops->go_demote_ok(gl);
+
+	return 1;
+}
+
+/**
+ * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
+ * @gl: the glock
+ *
+ * A glock that is already on the list is left alone.  Waiters on
+ * sd_reclaim_wq are woken in either case.
+ */
+
+void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+
+ spin_lock(&sdp->sd_reclaim_lock);
+ if (list_empty(&gl->gl_reclaim)) {
+ /* The reclaim list owns a reference while the glock is on it */
+ gfs2_glock_hold(gl);
+ list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
+ atomic_inc(&sdp->sd_reclaim_count);
+ }
+ spin_unlock(&sdp->sd_reclaim_lock);
+
+ wake_up(&sdp->sd_reclaim_wq);
+}
+
+/**
+ * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
+ * @sdp: the filesystem
+ *
+ * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
+ * different glock and we notice that there are a lot of glocks in the
+ * reclaim list.
+ *
+ * Takes the first glock off the list and, if its mutex can be taken
+ * without blocking and it is idle, locked and demote_ok(), requests a
+ * demote to LM_ST_UNLOCKED.
+ */
+
+void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
+{
+ struct gfs2_glock *gl;
+
+ spin_lock(&sdp->sd_reclaim_lock);
+ if (list_empty(&sdp->sd_reclaim_list)) {
+ spin_unlock(&sdp->sd_reclaim_lock);
+ return;
+ }
+ gl = list_entry(sdp->sd_reclaim_list.next,
+ struct gfs2_glock, gl_reclaim);
+ list_del_init(&gl->gl_reclaim);
+ spin_unlock(&sdp->sd_reclaim_lock);
+
+ atomic_dec(&sdp->sd_reclaim_count);
+ atomic_inc(&sdp->sd_reclaimed);
+
+ if (gfs2_glmutex_trylock(gl)) {
+ if (queue_empty(gl, &gl->gl_holders) &&
+ gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+ handle_callback(gl, LM_ST_UNLOCKED);
+ gfs2_glmutex_unlock(gl);
+ }
+
+ /* Drop the reference taken by gfs2_glock_schedule_for_reclaim() */
+ gfs2_glock_put(gl);
+}
+
+/**
+ * examine_bucket - Call a function for each glock in a hash bucket
+ * @examiner: the function
+ * @sdp: the filesystem
+ * @hash: index of the bucket in gl_hash_table
+ *
+ * Only glocks belonging to @sdp are passed to @examiner.  The bucket lock
+ * is dropped around each call; a reference on the glock keeps it alive in
+ * the meantime.
+ *
+ * Returns: 1 if at least one glock of @sdp was examined, 0 otherwise
+ */
+
+static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
+ unsigned int hash)
+{
+ struct gfs2_glock *gl, *prev = NULL;
+ int has_entries = 0;
+ struct hlist_head *head = &gl_hash_table[hash].hb_list;
+
+ read_lock(gl_lock_addr(hash));
+ /* Can't use hlist_for_each_entry - don't want prefetch here */
+ if (hlist_empty(head))
+ goto out;
+ gl = list_entry(head->first, struct gfs2_glock, gl_list);
+ while(1) {
+ if (gl->gl_sbd == sdp) {
+ gfs2_glock_hold(gl);
+ read_unlock(gl_lock_addr(hash));
+ /* The previous glock's reference is only dropped once the lock is released */
+ if (prev)
+ gfs2_glock_put(prev);
+ prev = gl;
+ examiner(gl);
+ has_entries = 1;
+ read_lock(gl_lock_addr(hash));
+ }
+ if (gl->gl_list.next == NULL)
+ break;
+ gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
+ }
+out:
+ read_unlock(gl_lock_addr(hash));
+ if (prev)
+ gfs2_glock_put(prev);
+ return has_entries;
+}
+
+/**
+ * scan_glock - look at a glock and see if we can reclaim it
+ * @gl: the glock to look at
+ *
+ * Inode glocks are skipped.  Any other glock is scheduled for reclaim when
+ * its mutex can be taken without blocking and it is idle, locked and
+ * demote_ok().
+ */
+
+static void scan_glock(struct gfs2_glock *gl)
+{
+	int reclaim = 0;
+
+	if (gl->gl_ops == &gfs2_inode_glops)
+		return;
+
+	if (gfs2_glmutex_trylock(gl)) {
+		reclaim = queue_empty(gl, &gl->gl_holders) &&
+			  gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl);
+		gfs2_glmutex_unlock(gl);
+	}
+
+	/* Schedule only after the glock mutex has been released */
+	if (reclaim)
+		gfs2_glock_schedule_for_reclaim(gl);
+}
+
+/**
+ * gfs2_scand_internal - Look for glocks and inodes to toss from memory
+ * @sdp: the filesystem
+ *
+ * Runs scan_glock() over every bucket of the glock hash table.
+ */
+
+void gfs2_scand_internal(struct gfs2_sbd *sdp)
+{
+	unsigned int bucket = 0;
+
+	while (bucket < GFS2_GL_HASH_SIZE) {
+		examine_bucket(scan_glock, sdp, bucket);
+		bucket++;
+	}
+}
+
+/**
+ * clear_glock - look at a glock and see if we can free it from glock cache
+ * @gl: the glock to look at
+ *
+ * Takes the glock off the reclaim list if it is on it, then requests a
+ * demote to LM_ST_UNLOCKED when the glock is idle and still locked.
+ */
+
+static void clear_glock(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ int released;
+
+ spin_lock(&sdp->sd_reclaim_lock);
+ if (!list_empty(&gl->gl_reclaim)) {
+ list_del_init(&gl->gl_reclaim);
+ atomic_dec(&sdp->sd_reclaim_count);
+ spin_unlock(&sdp->sd_reclaim_lock);
+ /* Drop the reclaim list's reference; it is asserted not to be the last one */
+ released = gfs2_glock_put(gl);
+ gfs2_assert(sdp, !released);
+ } else {
+ spin_unlock(&sdp->sd_reclaim_lock);
+ }
+
+ if (gfs2_glmutex_trylock(gl)) {
+ if (queue_empty(gl, &gl->gl_holders) &&
+ gl->gl_state != LM_ST_UNLOCKED)
+ handle_callback(gl, LM_ST_UNLOCKED);
+ gfs2_glmutex_unlock(gl);
+ }
+}
+
+/**
+ * gfs2_gl_hash_clear - Empty out the glock hash table
+ * @sdp: the filesystem
+ * @wait: wait until it's all gone
+ *
+ * Called when unmounting the filesystem, or when inter-node lock manager
+ * requests DROPLOCKS because it is running out of capacity.
+ *
+ * With @wait set, the table is rescanned every 10ms until no glock of @sdp
+ * is left, and the lock state is dumped each time gt_stall_secs pass
+ * without that happening.
+ */
+
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
+{
+ unsigned long t;
+ unsigned int x;
+ int cont;
+
+ t = jiffies;
+
+ for (;;) {
+ /* cont records whether any bucket still holds a glock of this filesystem */
+ cont = 0;
+ for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
+ if (examine_bucket(clear_glock, sdp, x))
+ cont = 1;
+ }
+
+ if (!wait || !cont)
+ break;
+
+ if (time_after_eq(jiffies,
+ t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
+ fs_warn(sdp, "Unmount seems to be stalled. "
+ "Dumping lock state...\n");
+ gfs2_dump_lockstate(sdp);
+ /* Restart the stall timer so the dump repeats at most once per interval */
+ t = jiffies;
+ }
+
+ invalidate_inodes(sdp->sd_vfs);
+ msleep(10);
+ }
+}
+
+/*
+ * Diagnostic routines to help debug distributed deadlock
+ */
+
+/**
+ * dump_holder - print information about a glock holder
+ * @str: a string naming the type of holder
+ * @gh: the glock holder
+ *
+ * Prints the owner pid (-1 if the holder has no owner), the requested
+ * state, the bit numbers set in gh_flags and gh_iflags, the error and the
+ * symbol the holder was initialized at.
+ *
+ * Returns: 0 (this implementation prints with printk and cannot fail)
+ */
+
+static int dump_holder(char *str, struct gfs2_holder *gh)
+{
+	unsigned int x;
+
+	printk(KERN_INFO " %s\n", str);
+	printk(KERN_INFO " owner = %ld\n",
+	       (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
+	printk(KERN_INFO " gh_state = %u\n", gh->gh_state);
+	printk(KERN_INFO " gh_flags =");
+	for (x = 0; x < 32; x++)
+		/* Unsigned constant: 1 << 31 would shift into the sign bit of int */
+		if (gh->gh_flags & (1U << x))
+			printk(" %u", x);
+	printk(" \n");
+	printk(KERN_INFO " error = %d\n", gh->gh_error);
+	printk(KERN_INFO " gh_iflags =");
+	for (x = 0; x < 32; x++)
+		if (test_bit(x, &gh->gh_iflags))
+			printk(" %u", x);
+	printk(" \n");
+	print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip);
+
+	return 0;
+}
+
+/**
+ * dump_inode - print information about an inode
+ * @ip: the inode
+ *
+ * Prints the inode numbers, the type derived from di_mode and the bit
+ * numbers set in i_flags.
+ *
+ * Returns: 0
+ */
+
+static int dump_inode(struct gfs2_inode *ip)
+{
+	unsigned int bit;
+
+	printk(KERN_INFO " Inode:\n");
+	printk(KERN_INFO " num = %llu %llu\n",
+	       (unsigned long long)ip->i_num.no_formal_ino,
+	       (unsigned long long)ip->i_num.no_addr);
+	printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode));
+	printk(KERN_INFO " i_flags =");
+	for (bit = 0; bit < 32; bit++) {
+		if (!test_bit(bit, &ip->i_flags))
+			continue;
+		printk(" %u", bit);
+	}
+	printk(" \n");
+
+	return 0;
+}
+
+/**
+ * dump_glock - print information about a glock
+ * @gl: the glock
+ *
+ * Prints the glock's fields, its pending request and every holder and
+ * waiter with printk, all under gl_spin.  For an inode glock with an
+ * attached object the inode is dumped too, unless the glock is busy.
+ *
+ * Returns: 0 on success, or the error of the first dump helper that failed
+ */
+
+static int dump_glock(struct gfs2_glock *gl)
+{
+	struct gfs2_holder *gh;
+	unsigned int x;
+	int error = -ENOBUFS;
+
+	spin_lock(&gl->gl_spin);
+
+	printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
+	       (unsigned long long)gl->gl_name.ln_number);
+	printk(KERN_INFO " gl_flags =");
+	for (x = 0; x < 32; x++) {
+		if (test_bit(x, &gl->gl_flags))
+			printk(" %u", x);
+	}
+	printk(" \n");
+	printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref));
+	printk(KERN_INFO " gl_state = %u\n", gl->gl_state);
+	/*
+	 * Do not dereference a NULL owner.
+	 * NOTE(review): gl_owner is presumably only set while the glock mutex
+	 * is held - confirm against gfs2_glmutex_lock()/gfs2_glmutex_unlock().
+	 */
+	if (gl->gl_owner)
+		printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner->comm);
+	else
+		printk(KERN_INFO " gl_owner = -1\n");
+	print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip);
+	printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
+	printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
+	printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
+	printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no");
+	printk(KERN_INFO " le = %s\n",
+	       (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
+	printk(KERN_INFO " reclaim = %s\n",
+	       (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
+	if (gl->gl_aspace)
+		printk(KERN_INFO " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
+		       gl->gl_aspace->i_mapping->nrpages);
+	else
+		printk(KERN_INFO " aspace = no\n");
+	printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count));
+	if (gl->gl_req_gh) {
+		error = dump_holder("Request", gl->gl_req_gh);
+		if (error)
+			goto out;
+	}
+	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+		error = dump_holder("Holder", gh);
+		if (error)
+			goto out;
+	}
+	list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
+		error = dump_holder("Waiter1", gh);
+		if (error)
+			goto out;
+	}
+	list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
+		error = dump_holder("Waiter2", gh);
+		if (error)
+			goto out;
+	}
+	list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
+		error = dump_holder("Waiter3", gh);
+		if (error)
+			goto out;
+	}
+	if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
+		/* Only dump the inode when nobody has the glock locked or held */
+		if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
+		    list_empty(&gl->gl_holders)) {
+			error = dump_inode(gl->gl_object);
+			if (error)
+				goto out;
+		} else {
+			error = -ENOBUFS;
+			printk(KERN_INFO " Inode: busy\n");
+		}
+	}
+
+	error = 0;
+
+out:
+	spin_unlock(&gl->gl_spin);
+	return error;
+}
+
+/**
+ * gfs2_dump_lockstate - print out the current lockstate
+ * @sdp: the filesystem
+ *
+ * Calls dump_glock() for every glock in the hash table that belongs to
+ * @sdp and stops at the first error.
+ *
+ * Returns: 0 on success, or the first error returned by dump_glock()
+ */
+
+static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
+{
+ struct gfs2_glock *gl;
+ struct hlist_node *h;
+ unsigned int x;
+ int error = 0;
+
+ for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
+
+ read_lock(gl_lock_addr(x));
+
+ hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
+ /* The hash table also holds glocks of other filesystems; skip those */
+ if (gl->gl_sbd != sdp)
+ continue;
+
+ error = dump_glock(gl);
+ if (error)
+ break;
+ }
+
+ read_unlock(gl_lock_addr(x));
+
+ if (error)
+ break;
+ }
+
+
+ return error;
+}
+
+/*
+ * gfs2_glock_init - set up the global glock hash table
+ *
+ * Initializes every bucket list head and, when GL_HASH_LOCK_SZ is defined,
+ * every lock in gl_hash_locks.
+ *
+ * Returns: 0
+ */
+int __init gfs2_glock_init(void)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < GFS2_GL_HASH_SIZE; idx++)
+		INIT_HLIST_HEAD(&gl_hash_table[idx].hb_list);
+#ifdef GL_HASH_LOCK_SZ
+	for (idx = 0; idx < GL_HASH_LOCK_SZ; idx++)
+		rwlock_init(&gl_hash_locks[idx]);
+#endif
+	return 0;
+}
+
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..2b2a889ee2cc
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __GLOCK_DOT_H__
+#define __GLOCK_DOT_H__
+
+#include "incore.h"
+
+/* Flags for lock requests; used in gfs2_holder gh_flag field.
+ From lm_interface.h:
+#define LM_FLAG_TRY 0x00000001
+#define LM_FLAG_TRY_1CB 0x00000002
+#define LM_FLAG_NOEXP 0x00000004
+#define LM_FLAG_ANY 0x00000008
+#define LM_FLAG_PRIORITY 0x00000010 */
+
+#define GL_LOCAL_EXCL 0x00000020
+#define GL_ASYNC 0x00000040
+#define GL_EXACT 0x00000080
+#define GL_SKIP 0x00000100
+#define GL_ATIME 0x00000200
+#define GL_NOCACHE 0x00000400
+#define GL_NOCANCEL 0x00001000
+#define GL_AOP 0x00004000
+#define GL_DUMP 0x00008000
+
+#define GLR_TRYFAILED 13
+#define GLR_CANCELED 14
+
+/*
+ * Returns 1 if the current task owns one of the holders on the glock's
+ * gl_holders list, 0 otherwise.  The list is walked under gl_spin.
+ */
+static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
+{
+	struct gfs2_holder *pos;
+	int mine = 0;
+
+	spin_lock(&gl->gl_spin);
+	list_for_each_entry(pos, &gl->gl_holders, gh_list) {
+		if (pos->gh_owner != current)
+			continue;
+		mine = 1;
+		break;
+	}
+	spin_unlock(&gl->gl_spin);
+
+	return mine;
+}
+
+/* Returns non-zero if the glock's current state is LM_ST_EXCLUSIVE (read without gl_spin). */
+static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
+{
+ return gl->gl_state == LM_ST_EXCLUSIVE;
+}