linux/fs/ocfs2/stackglue.h

307 lines
9.3 KiB
C
Raw Normal View History

/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* stackglue.h
*
* Glue to the underlying cluster stack.
*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation, version 2.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef STACKGLUE_H
#define STACKGLUE_H
#include <linux/types.h>
#include <linux/list.h>
#include <linux/dlmconstants.h>
#include "dlm/dlmapi.h"
#include <linux/dlm.h>
/* Needed for plock-related prototypes */
struct file;
struct file_lock;
/*
* dlmconstants.h does not have a LOCAL flag. We hope to remove it
* some day, but right now we need it. Let's fake it. This value is larger
* than any flag in dlmconstants.h.
*/
#define DLM_LKF_LOCAL 0x00100000
/*
* This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably
* wants to be in a public header.
*/
#define GROUP_NAME_MAX 64
ocfs2: add clustername to cluster connection This is an effort of removing ocfs2_controld.pcmk and getting ocfs2 DLM handling up to the times with respect to DLM (>=4.0.1) and corosync (2.3.x). AFAIK, cman also is being phased out for a unified corosync cluster stack. fs/dlm performs all the functions with respect to fencing and node management and provides the API's to do so for ocfs2. For all future references, DLM stands for fs/dlm code. The advantages are: + No need to run an additional userspace daemon (ocfs2_controld) + No controld device handling and controld protocol + Shifting responsibilities of node management to DLM layer For backward compatibility, we are keeping the controld handling code. Once enough time has passed we can remove a significant portion of the code. This was tested by using the kernel with changes on older unmodified tools. The kernel used ocfs2_controld as expected, and displayed the appropriate warning message. This feature requires modification in the userspace ocfs2-tools. The changes can be found at: https://github.com/goldwynr/ocfs2-tools branch: nocontrold Currently, not many checks are present in the userspace code, but that would change soon. This patch (of 6): Add clustername to cluster connection. Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> Reviewed-by: Mark Fasheh <mfasheh@suse.de> Cc: Joel Becker <jlbec@evilplan.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-01-22 07:48:21 +08:00
/* This shadows OCFS2_CLUSTER_NAME_LEN */
#define CLUSTER_NAME_MAX 16
/*
* ocfs2_protocol_version changes when ocfs2 does something different in
* its inter-node behavior. See dlmglue.c for more information.
*/
struct ocfs2_protocol_version {
u8 pv_major;
u8 pv_minor;
};
/*
* The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
* has a pointer to separately allocated lvb space. This struct exists only to
* include in the lksb union to make space for a combined dlm_lksb and lvb.
*/
struct fsdlm_lksb_plus_lvb {
struct dlm_lksb lksb;
char lvb[DLM_LVB_LEN];
};
/*
* A union of all lock status structures. We define it here so that the
* size of the union is known. Lock status structures are embedded in
* ocfs2 inodes.
*/
struct ocfs2_cluster_connection;
struct ocfs2_dlm_lksb {
union {
struct dlm_lockstatus lksb_o2dlm;
struct dlm_lksb lksb_fsdlm;
struct fsdlm_lksb_plus_lvb padding;
};
struct ocfs2_cluster_connection *lksb_conn;
};
/*
* The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
*/
struct ocfs2_locking_protocol {
struct ocfs2_protocol_version lp_max_version;
void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
};
/*
* A cluster connection. Mostly opaque to ocfs2, the connection holds
* state for the underlying stack. ocfs2 does use cc_version to determine
* locking compatibility.
*/
struct ocfs2_cluster_connection {
ocfs2: add clustername to cluster connection This is an effort of removing ocfs2_controld.pcmk and getting ocfs2 DLM handling up to the times with respect to DLM (>=4.0.1) and corosync (2.3.x). AFAIK, cman also is being phased out for a unified corosync cluster stack. fs/dlm performs all the functions with respect to fencing and node management and provides the API's to do so for ocfs2. For all future references, DLM stands for fs/dlm code. The advantages are: + No need to run an additional userspace daemon (ocfs2_controld) + No controld device handling and controld protocol + Shifting responsibilities of node management to DLM layer For backward compatibility, we are keeping the controld handling code. Once enough time has passed we can remove a significant portion of the code. This was tested by using the kernel with changes on older unmodified tools. The kernel used ocfs2_controld as expected, and displayed the appropriate warning message. This feature requires modification in the userspace ocfs2-tools. The changes can be found at: https://github.com/goldwynr/ocfs2-tools branch: nocontrold Currently, not many checks are present in the userspace code, but that would change soon. This patch (of 6): Add clustername to cluster connection. Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> Reviewed-by: Mark Fasheh <mfasheh@suse.de> Cc: Joel Becker <jlbec@evilplan.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-01-22 07:48:21 +08:00
char cc_name[GROUP_NAME_MAX + 1];
int cc_namelen;
ocfs2: add clustername to cluster connection This is an effort of removing ocfs2_controld.pcmk and getting ocfs2 DLM handling up to the times with respect to DLM (>=4.0.1) and corosync (2.3.x). AFAIK, cman also is being phased out for a unified corosync cluster stack. fs/dlm performs all the functions with respect to fencing and node management and provides the API's to do so for ocfs2. For all future references, DLM stands for fs/dlm code. The advantages are: + No need to run an additional userspace daemon (ocfs2_controld) + No controld device handling and controld protocol + Shifting responsibilities of node management to DLM layer For backward compatibility, we are keeping the controld handling code. Once enough time has passed we can remove a significant portion of the code. This was tested by using the kernel with changes on older unmodified tools. The kernel used ocfs2_controld as expected, and displayed the appropriate warning message. This feature requires modification in the userspace ocfs2-tools. The changes can be found at: https://github.com/goldwynr/ocfs2-tools branch: nocontrold Currently, not many checks are present in the userspace code, but that would change soon. This patch (of 6): Add clustername to cluster connection. Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> Reviewed-by: Mark Fasheh <mfasheh@suse.de> Cc: Joel Becker <jlbec@evilplan.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-01-22 07:48:21 +08:00
char cc_cluster_name[CLUSTER_NAME_MAX + 1];
int cc_cluster_name_len;
struct ocfs2_protocol_version cc_version;
struct ocfs2_locking_protocol *cc_proto;
void (*cc_recovery_handler)(int node_num, void *recovery_data);
void *cc_recovery_data;
void *cc_lockspace;
void *cc_private;
};
/*
* Each cluster stack implements the stack operations structure. Not used
* in the ocfs2 code, the stackglue code translates generic cluster calls
* into stack operations.
*/
struct ocfs2_stack_operations {
/*
* The fs code calls ocfs2_cluster_connect() to attach a new
* filesystem to the cluster stack. The ->connect() op is passed
* an ocfs2_cluster_connection with the name and recovery field
* filled in.
*
* The stack must set up any notification mechanisms and create
* the filesystem lockspace in the DLM. The lockspace should be
* stored on cc_lockspace. Any other information can be stored on
* cc_private.
*
* ->connect() must not return until it is guaranteed that
*
* - Node down notifications for the filesystem will be received
* and passed to conn->cc_recovery_handler().
* - Locking requests for the filesystem will be processed.
*/
int (*connect)(struct ocfs2_cluster_connection *conn);
/*
* The fs code calls ocfs2_cluster_disconnect() when a filesystem
* no longer needs cluster services. All DLM locks have been
* dropped, and recovery notification is being ignored by the
* fs code. The stack must disengage from the DLM and discontinue
* recovery notification.
*
* Once ->disconnect() has returned, the connection structure will
* be freed. Thus, a stack must not return from ->disconnect()
* until it will no longer reference the conn pointer.
*
* Once this call returns, the stack glue will be dropping this
* connection's reference on the module.
*/
int (*disconnect)(struct ocfs2_cluster_connection *conn);
/*
* ->this_node() returns the cluster's unique identifier for the
* local node.
*/
int (*this_node)(struct ocfs2_cluster_connection *conn,
unsigned int *node);
/*
* Call the underlying dlm lock function. The ->dlm_lock()
* callback should convert the flags and mode as appropriate.
*
* ast and bast functions are not part of the call because the
* stack will likely want to wrap ast and bast calls before passing
* them to stack->sp_proto. There is no astarg. The lksb will
* be passed back to the ast and bast functions. The caller can
* use this to find their object.
*/
int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
int mode,
struct ocfs2_dlm_lksb *lksb,
u32 flags,
void *name,
unsigned int namelen);
/*
* Call the underlying dlm unlock function. The ->dlm_unlock()
* function should convert the flags as appropriate.
*
* The unlock ast is not passed, as the stack will want to wrap
* it before calling stack->sp_proto->lp_unlock_ast(). There is
* no astarg. The lksb will be passed back to the unlock ast
* function. The caller can use this to find their object.
*/
int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags);
/*
* Return the status of the current lock status block. The fs
* code should never dereference the union. The ->lock_status()
* callback pulls out the stack-specific lksb, converts the status
* to a proper errno, and returns it.
*/
int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
/*
* Return non-zero if the LVB is valid.
*/
int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
/*
* Pull the lvb pointer off of the stack-specific lksb.
*/
void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
/*
* Cluster-aware posix locks
*
* This is NULL for stacks which do not support posix locks.
*/
int (*plock)(struct ocfs2_cluster_connection *conn,
u64 ino,
struct file *file,
int cmd,
struct file_lock *fl);
/*
* This is an optoinal debugging hook. If provided, the
* stack can dump debugging information about this lock.
*/
void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
};
/*
* Each stack plugin must describe itself by registering a
* ocfs2_stack_plugin structure. This is only seen by stackglue and the
* stack driver.
*/
struct ocfs2_stack_plugin {
char *sp_name;
struct ocfs2_stack_operations *sp_ops;
struct module *sp_owner;
/* These are managed by the stackglue code. */
struct list_head sp_list;
unsigned int sp_count;
struct ocfs2_protocol_version sp_max_proto;
};
/* Used by the filesystem */
int ocfs2_cluster_connect(const char *stack_name,
ocfs2: add clustername to cluster connection This is an effort of removing ocfs2_controld.pcmk and getting ocfs2 DLM handling up to the times with respect to DLM (>=4.0.1) and corosync (2.3.x). AFAIK, cman also is being phased out for a unified corosync cluster stack. fs/dlm performs all the functions with respect to fencing and node management and provides the API's to do so for ocfs2. For all future references, DLM stands for fs/dlm code. The advantages are: + No need to run an additional userspace daemon (ocfs2_controld) + No controld device handling and controld protocol + Shifting responsibilities of node management to DLM layer For backward compatibility, we are keeping the controld handling code. Once enough time has passed we can remove a significant portion of the code. This was tested by using the kernel with changes on older unmodified tools. The kernel used ocfs2_controld as expected, and displayed the appropriate warning message. This feature requires modification in the userspace ocfs2-tools. The changes can be found at: https://github.com/goldwynr/ocfs2-tools branch: nocontrold Currently, not many checks are present in the userspace code, but that would change soon. This patch (of 6): Add clustername to cluster connection. Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> Reviewed-by: Mark Fasheh <mfasheh@suse.de> Cc: Joel Becker <jlbec@evilplan.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-01-22 07:48:21 +08:00
const char *cluster_name,
int cluster_name_len,
const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
void (*recovery_handler)(int node_num,
void *recovery_data),
void *recovery_data,
struct ocfs2_cluster_connection **conn);
/*
* Used by callers that don't store their stack name. They must ensure
* all nodes have the same stack.
*/
int ocfs2_cluster_connect_agnostic(const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
void (*recovery_handler)(int node_num,
void *recovery_data),
void *recovery_data,
struct ocfs2_cluster_connection **conn);
int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
int hangup_pending);
void ocfs2_cluster_hangup(const char *group, int grouplen);
int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
unsigned int *node);
struct ocfs2_lock_res;
int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
int mode,
struct ocfs2_dlm_lksb *lksb,
u32 flags,
void *name,
unsigned int namelen);
int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags);
int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
int ocfs2_stack_supports_plocks(void);
int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
struct file *file, int cmd, struct file_lock *fl);
void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
/* Used by stack plugins */
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
ocfs2: fix crash caused by stale lvb with fsdlm plugin The crash happens rather often when we reset some cluster nodes while nodes contend fiercely to do truncate and append. The crash backtrace is below: dlm: C21CBDA5E0774F4BA5A9D4F317717495: dlm_recover_grant 1 locks on 971 resources dlm: C21CBDA5E0774F4BA5A9D4F317717495: dlm_recover 9 generation 5 done: 4 ms ocfs2: Begin replay journal (node 318952601, slot 2) on device (253,18) ocfs2: End replay journal (node 318952601, slot 2) on device (253,18) ocfs2: Beginning quota recovery on device (253,18) for slot 2 ocfs2: Finishing quota recovery on device (253,18) for slot 2 (truncate,30154,1):ocfs2_truncate_file:470 ERROR: bug expression: le64_to_cpu(fe->i_size) != i_size_read(inode) (truncate,30154,1):ocfs2_truncate_file:470 ERROR: Inode 290321, inode i_size = 732 != di i_size = 937, i_flags = 0x1 ------------[ cut here ]------------ kernel BUG at /usr/src/linux/fs/ocfs2/file.c:470! invalid opcode: 0000 [#1] SMP Modules linked in: ocfs2_stack_user(OEN) ocfs2(OEN) ocfs2_nodemanager ocfs2_stackglue(OEN) quota_tree dlm(OEN) configfs fuse sd_mod iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi af_packet iscsi_ibft iscsi_boot_sysfs softdog xfs libcrc32c ppdev parport_pc pcspkr parport joydev virtio_balloon virtio_net i2c_piix4 acpi_cpufreq button processor ext4 crc16 jbd2 mbcache ata_generic cirrus virtio_blk ata_piix drm_kms_helper ahci syscopyarea libahci sysfillrect sysimgblt fb_sys_fops ttm floppy libata drm virtio_pci virtio_ring uhci_hcd virtio ehci_hcd usbcore serio_raw usb_common sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4 Supported: No, Unsupported modules are loaded CPU: 1 PID: 30154 Comm: truncate Tainted: G OE N 4.4.21-69-default #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20151112_172657-sheep25 04/01/2014 task: ffff88004ff6d240 ti: ffff880074e68000 task.ti: ffff880074e68000 RIP: 0010:[<ffffffffa05c8c30>] [<ffffffffa05c8c30>] ocfs2_truncate_file+0x640/0x6c0 [ocfs2] RSP: 0018:ffff880074e6bd50 EFLAGS: 00010282 RAX: 0000000000000074 RBX: 000000000000029e RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000246 RDI: 0000000000000246 RBP: ffff880074e6bda8 R08: 000000003675dc7a R09: ffffffff82013414 R10: 0000000000034c50 R11: 0000000000000000 R12: ffff88003aab3448 R13: 00000000000002dc R14: 0000000000046e11 R15: 0000000000000020 FS: 00007f839f965700(0000) GS:ffff88007fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f839f97e000 CR3: 0000000036723000 CR4: 00000000000006e0 Call Trace: ocfs2_setattr+0x698/0xa90 [ocfs2] notify_change+0x1ae/0x380 do_truncate+0x5e/0x90 do_sys_ftruncate.constprop.11+0x108/0x160 entry_SYSCALL_64_fastpath+0x12/0x6d Code: 24 28 ba d6 01 00 00 48 c7 c6 30 43 62 a0 8b 41 2c 89 44 24 08 48 8b 41 20 48 c7 c1 78 a3 62 a0 48 89 04 24 31 c0 e8 a0 97 f9 ff <0f> 0b 3d 00 fe ff ff 0f 84 ab fd ff ff 83 f8 fc 0f 84 a2 fd ff RIP [<ffffffffa05c8c30>] ocfs2_truncate_file+0x640/0x6c0 [ocfs2] It's because ocfs2_inode_lock() get us stale LVB in which the i_size is not equal to the disk i_size. We mistakenly trust the LVB because the underlaying fsdlm dlm_lock() doesn't set lkb_sbflags with DLM_SBF_VALNOTVALID properly for us. But, why? The current code tries to downconvert lock without DLM_LKF_VALBLK flag to tell o2cb don't update RSB's LVB if it's a PR->NULL conversion, even if the lock resource type needs LVB. This is not the right way for fsdlm. The fsdlm plugin behaves different on DLM_LKF_VALBLK, it depends on DLM_LKF_VALBLK to decide if we care about the LVB in the LKB. If DLM_LKF_VALBLK is not set, fsdlm will skip recovering RSB's LVB from this lkb and set the right DLM_SBF_VALNOTVALID appropriately when node failure happens. The following diagram briefly illustrates how this crash happens: RSB1 is inode metadata lock resource with LOCK_TYPE_USES_LVB; The 1st round: Node1 Node2 RSB1: PR RSB1(master): NULL->EX ocfs2_downconvert_lock(PR->NULL, set_lvb==0) ocfs2_dlm_lock(no DLM_LKF_VALBLK) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - dlm_lock(no DLM_LKF_VALBLK) convert_lock(overwrite lkb->lkb_exflags with no DLM_LKF_VALBLK) RSB1: NULL RSB1: EX reset Node2 dlm_recover_rsbs() recover_lvb() /* The LVB is not trustable if the node with EX fails and * no lock >= PR is left. We should set RSB_VALNOTVALID for RSB1. */ if(!(kb_exflags & DLM_LKF_VALBLK)) /* This means we miss the chance to return; * to invalid the LVB here. */ The 2nd round: Node 1 Node2 RSB1(become master from recovery) ocfs2_setattr() ocfs2_inode_lock(NULL->EX) /* dlm_lock() return the stale lvb without setting DLM_SBF_VALNOTVALID */ ocfs2_meta_lvb_is_trustable() return 1 /* so we don't refresh inode from disk */ ocfs2_truncate_file() mlog_bug_on_msg(disk isize != i_size_read(inode)) /* crash! */ The fix is quite straightforward. We keep to set DLM_LKF_VALBLK flag for dlm_lock() if the lock resource type needs LVB and the fsdlm plugin is uesed. Link: http://lkml.kernel.org/r/1481275846-6604-1-git-send-email-zren@suse.com Signed-off-by: Eric Ren <zren@suse.com> Reviewed-by: Joseph Qi <jiangqi903@gmail.com> Cc: Mark Fasheh <mfasheh@versity.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-11 08:57:33 +08:00
/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
int ocfs2_is_o2cb_active(void);
ocfs2: export ocfs2_kset for online file check When there are errors in the ocfs2 filesystem, they are usually accompanied by the inode number which caused the error. This inode number would be the input to fixing the file. One of these options could be considered: A file in the sys filesytem which would accept inode numbers. This could be used to communication back what has to be fixed or is fixed. You could write: $# echo "<inode>" > /sys/fs/ocfs2/devname/filecheck/check or $# echo "<inode>" > /sys/fs/ocfs2/devname/filecheck/fix Compare with second version, I re-design filecheck sysfs interfaces, there are three sysfs files (check, fix and set) under filecheck directory (see above), sysfs will accept only one argument <inode>. Second, I adjust some code in ocfs2_filecheck_repair_inode_block() function according to upstream feedback, we cannot just add VALID_FL flag back as a inode block fix, then we will not fix this field corruption currently until having a complete solution. Compare with first version, I use strncasecmp instead of double strncmp functions. Second, update the source file contribution vendor. This patch (of 4): Export ocfs2_kset object from ocfs2_stackglue kernel module, then online file check code will create the related sysfiles under ocfs2_kset object. We're exporting this because it's built in ocfs2_stackglue.ko. Signed-off-by: Gang He <ghe@suse.com> Reviewed-by: Mark Fasheh <mfasheh@suse.de> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Joseph Qi <joseph.qi@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-23 05:24:20 +08:00
extern struct kset *ocfs2_kset;
#endif /* STACKGLUE_H */