2020-05-13 07:54:17 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2005-11-02 11:58:39 +08:00
|
|
|
* Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
|
|
|
|
* All Rights Reserved.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
#ifndef __XFS_ATTR_H__
|
|
|
|
#define __XFS_ATTR_H__
|
|
|
|
|
2008-06-23 11:23:48 +08:00
|
|
|
struct xfs_inode;
|
|
|
|
struct xfs_da_args;
|
|
|
|
struct xfs_attr_list_context;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Large attribute lists are structured around Btrees where all the data
|
|
|
|
* elements are in the leaf nodes. Attribute names are hashed into an int,
|
|
|
|
* then that int is used as the index into the Btree. Since the hashval
|
|
|
|
* of an attribute name may not be unique, we may have duplicate keys.
|
|
|
|
* The internal links in the Btree are logical block offsets into the file.
|
|
|
|
*
|
|
|
|
* Small attribute lists use a different format and are packed as tightly
|
|
|
|
* as possible so as to fit into the literal area of the inode.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The maximum size (into the kernel or returned from the kernel) of an
|
|
|
|
* attribute value or the buffer used for an attr_list() call. Larger
|
|
|
|
* sizes will result in an ERANGE return code.
|
|
|
|
*/
|
|
|
|
#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Kernel-internal version of the attrlist cursor.
|
|
|
|
*/
|
2020-02-27 09:30:43 +08:00
|
|
|
struct xfs_attrlist_cursor_kern {
|
2005-04-17 06:20:36 +08:00
|
|
|
__u32 hashval; /* hash value of next entry to add */
|
|
|
|
__u32 blkno; /* block containing entry (suggestion) */
|
|
|
|
__u32 offset; /* offset in list of equal-hashvals */
|
|
|
|
__u16 pad1; /* padding to match user-level */
|
|
|
|
__u8 pad2; /* padding to match user-level */
|
|
|
|
__u8 initted; /* T/F: cursor has been initialized */
|
2020-02-27 09:30:43 +08:00
|
|
|
};
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
|
|
|
|
/*========================================================================
|
2008-06-23 11:23:48 +08:00
|
|
|
* Structure used to pass context around among the routines.
|
2005-04-17 06:20:36 +08:00
|
|
|
*========================================================================*/
|
|
|
|
|
2008-06-23 11:23:48 +08:00
|
|
|
|
2016-12-05 09:32:14 +08:00
|
|
|
/*
 * List output formatting callback, stored in
 * xfs_attr_list_context.put_listent.
 *
 * Returns void; state is communicated via *context.
 */
typedef void (*put_listent_func_t)(struct xfs_attr_list_context *context,
		int flags, unsigned char *name, int namelen, void *value,
		int valuelen);
|
2008-06-23 11:23:48 +08:00
|
|
|
|
2020-02-27 09:30:37 +08:00
|
|
|
struct xfs_attr_list_context {
|
|
|
|
struct xfs_trans *tp;
|
|
|
|
struct xfs_inode *dp; /* inode */
|
2020-02-27 09:30:43 +08:00
|
|
|
struct xfs_attrlist_cursor_kern cursor; /* position in list */
|
2020-02-27 09:30:37 +08:00
|
|
|
void *buffer; /* output buffer */
|
2019-07-06 01:29:54 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Abort attribute list iteration if non-zero. Can be used to pass
|
|
|
|
* error values to the xfs_attr_list caller.
|
|
|
|
*/
|
2020-02-27 09:30:37 +08:00
|
|
|
int seen_enough;
|
|
|
|
bool allow_incomplete;
|
|
|
|
|
|
|
|
ssize_t count; /* num used entries */
|
|
|
|
int dupcnt; /* count dup hashvals seen */
|
|
|
|
int bufsize; /* total buffer size */
|
|
|
|
int firstu; /* first used byte in buffer */
|
2020-02-27 09:30:42 +08:00
|
|
|
unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE} */
|
2020-02-27 09:30:37 +08:00
|
|
|
int resynch; /* T/F: resynch with cursor */
|
|
|
|
put_listent_func_t put_listent; /* list output fmt function */
|
|
|
|
int index; /* index into output buffer */
|
|
|
|
};
|
2008-06-23 11:23:48 +08:00
|
|
|
|
|
|
|
|
2021-04-27 06:00:33 +08:00
|
|
|
/*
|
|
|
|
* ========================================================================
|
|
|
|
* Structure used to pass context around among the delayed routines.
|
|
|
|
* ========================================================================
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Below is a state machine diagram for attr remove operations. The XFS_DAS_*
|
|
|
|
* states indicate places where the function would return -EAGAIN, and then
|
|
|
|
* immediately resume from after being called by the calling function. States
|
|
|
|
* marked as a "subroutine state" indicate that they belong to a subroutine, and
|
|
|
|
* so the calling function needs to pass them back to that subroutine to allow
|
|
|
|
* it to finish where it left off. But they otherwise do not have a role in the
|
|
|
|
* calling function other than just passing through.
|
|
|
|
*
|
|
|
|
* xfs_attr_remove_iter()
|
|
|
|
* │
|
|
|
|
* v
|
|
|
|
* have attr to remove? ──n──> done
|
|
|
|
* │
|
|
|
|
* y
|
|
|
|
* │
|
|
|
|
* v
|
|
|
|
* are we short form? ──y──> xfs_attr_shortform_remove ──> done
|
|
|
|
* │
|
|
|
|
* n
|
|
|
|
* │
|
|
|
|
* V
|
|
|
|
* are we leaf form? ──y──> xfs_attr_leaf_removename ──> done
|
|
|
|
* │
|
|
|
|
* n
|
|
|
|
* │
|
|
|
|
* V
|
|
|
|
* ┌── need to setup state?
|
|
|
|
* │ │
|
|
|
|
* n y
|
|
|
|
* │ │
|
|
|
|
* │ v
|
|
|
|
* │ find attr and get state
|
|
|
|
* │ attr has remote blks? ──n─┐
|
|
|
|
* │ │ v
|
|
|
|
* │ │ find and invalidate
|
|
|
|
* │ y the remote blocks.
|
|
|
|
* │ │ mark attr incomplete
|
|
|
|
* │ ├────────────────┘
|
|
|
|
* └──────────┤
|
|
|
|
* │
|
|
|
|
* v
|
|
|
|
* Have remote blks to remove? ───y─────┐
|
|
|
|
* │ ^ remove the blks
|
|
|
|
* │ │ │
|
|
|
|
* │ │ v
|
|
|
|
* │ XFS_DAS_RMTBLK <─n── done?
|
|
|
|
* │ re-enter with │
|
|
|
|
* │ one less blk to y
|
|
|
|
* │ remove │
|
|
|
|
* │ V
|
|
|
|
* │ refill the state
|
|
|
|
* n │
|
|
|
|
* │ v
|
|
|
|
* │ XFS_DAS_RM_NAME
|
|
|
|
* │ │
|
|
|
|
* ├─────────────────────────┘
|
|
|
|
* │
|
|
|
|
* v
|
|
|
|
* remove leaf and
|
|
|
|
* update hash with
|
|
|
|
* xfs_attr_node_remove_cleanup
|
|
|
|
* │
|
|
|
|
* v
|
|
|
|
* need to
|
|
|
|
* shrink tree? ─n─┐
|
|
|
|
* │ │
|
|
|
|
* y │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* join leaf │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* XFS_DAS_RM_SHRINK │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* do the shrink │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* free state <──┘
|
|
|
|
* │
|
|
|
|
* v
|
|
|
|
* done
|
|
|
|
*
|
2021-05-22 06:48:13 +08:00
|
|
|
*
|
|
|
|
* Below is a state machine diagram for attr set operations.
|
|
|
|
*
|
|
|
|
* It seems the challenge with understanding this system comes from trying to
|
|
|
|
* absorb the state machine all at once, when really one should only be looking
|
|
|
|
* at it within the context of a single function. Once a state sensitive
|
|
|
|
* function is called, the idea is that it "takes ownership" of the
|
|
|
|
* state machine. It isn't concerned with the states that may have belonged to
|
|
|
|
* its calling parent. Only the states relevant to itself or any other
|
|
|
|
* subroutines therein. Once a calling function hands off the state machine to
|
|
|
|
* a subroutine, it needs to respect the simple rule that it doesn't "own" the
|
|
|
|
* state machine anymore, and it's the responsibility of that calling function
|
|
|
|
* to propagate the -EAGAIN back up the call stack. Upon reentry, it is
|
|
|
|
* committed to re-calling that subroutine until it returns something other than
|
|
|
|
* -EAGAIN. Once that subroutine signals completion (by returning anything other
|
|
|
|
* than -EAGAIN), the calling function can resume using the state machine.
|
|
|
|
*
|
|
|
|
* xfs_attr_set_iter()
|
|
|
|
* │
|
|
|
|
* v
|
|
|
|
* ┌─y─ has an attr fork?
|
|
|
|
* │ |
|
|
|
|
* │ n
|
|
|
|
* │ |
|
|
|
|
* │ V
|
|
|
|
* │ add a fork
|
|
|
|
* │ │
|
|
|
|
* └──────────┤
|
|
|
|
* │
|
|
|
|
* V
|
|
|
|
* ┌─── is shortform?
|
|
|
|
* │ │
|
|
|
|
* │ y
|
|
|
|
* │ │
|
|
|
|
* │ V
|
|
|
|
* │ xfs_attr_set_fmt
|
|
|
|
* │ |
|
|
|
|
* │ V
|
|
|
|
* │ xfs_attr_try_sf_addname
|
|
|
|
* │ │
|
|
|
|
* │ V
|
|
|
|
* │ had enough ──y──> done
|
|
|
|
* │ space?
|
|
|
|
* n │
|
|
|
|
* │ n
|
|
|
|
* │ │
|
|
|
|
* │ V
|
|
|
|
* │ transform to leaf
|
|
|
|
* │ │
|
|
|
|
* │ V
|
|
|
|
* │ hold the leaf buffer
|
|
|
|
* │ │
|
|
|
|
* │ V
|
|
|
|
* │ return -EAGAIN
|
|
|
|
* │ Re-enter in
|
|
|
|
* │ leaf form
|
|
|
|
* │
|
|
|
|
* └─> release leaf buffer
|
|
|
|
* if needed
|
|
|
|
* │
|
|
|
|
* V
|
|
|
|
* ┌───n── fork has
|
|
|
|
* │ only 1 blk?
|
|
|
|
* │ │
|
|
|
|
* │ y
|
|
|
|
* │ │
|
|
|
|
* │ v
|
|
|
|
* │ xfs_attr_leaf_try_add()
|
|
|
|
* │ │
|
|
|
|
* │ v
|
|
|
|
* │ had enough ──────────────y─────────────┐
|
|
|
|
* │ space? │
|
|
|
|
* │ │ │
|
|
|
|
* │ n │
|
|
|
|
* │ │ │
|
|
|
|
* │ v │
|
|
|
|
* │ return -EAGAIN │
|
|
|
|
* │ re-enter in │
|
|
|
|
* │ node form │
|
|
|
|
* │ │ │
|
|
|
|
* └──────────┤ │
|
|
|
|
* │ │
|
|
|
|
* V │
|
|
|
|
* xfs_attr_node_addname_find_attr │
|
|
|
|
* determines if this │
|
|
|
|
* is create or rename │
|
|
|
|
* find space to store attr │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* xfs_attr_node_addname │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* fits in a node leaf? ────n─────┐ │
|
|
|
|
* │ ^ v │
|
|
|
|
* │ │ single leaf node? │
|
|
|
|
* │ │ │ │ │
|
|
|
|
* y │ y n │
|
|
|
|
* │ │ │ │ │
|
|
|
|
* v │ v v │
|
|
|
|
* update │ grow the leaf split if │
|
|
|
|
* hashvals └── return -EAGAIN needed │
|
|
|
|
* │ retry leaf add │ │
|
|
|
|
* │ on reentry │ │
|
|
|
|
* ├────────────────────────────┘ │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* need to alloc │
|
|
|
|
* ┌─y── or flip flag? │
|
|
|
|
* │ │ │
|
|
|
|
* │ n │
|
|
|
|
* │ │ │
|
|
|
|
* │ v │
|
|
|
|
* │ done │
|
|
|
|
* │ │
|
|
|
|
* │ │
|
|
|
|
* │ XFS_DAS_FOUND_LBLK <────────────────┘
|
|
|
|
* │ │
|
|
|
|
* │ V
|
|
|
|
* │ xfs_attr_leaf_addname()
|
|
|
|
* │ │
|
|
|
|
* │ v
|
|
|
|
* │ ┌──first time through?
|
|
|
|
* │ │ │
|
|
|
|
* │ │ y
|
|
|
|
* │ │ │
|
|
|
|
* │ n v
|
|
|
|
* │ │ if we have rmt blks
|
|
|
|
* │ │ find space for them
|
|
|
|
* │ │ │
|
|
|
|
* │ └──────────┤
|
|
|
|
* │ │
|
|
|
|
* │ v
|
|
|
|
* │ still have
|
|
|
|
* │ ┌─n─ blks to alloc? <──┐
|
|
|
|
* │ │ │ │
|
|
|
|
* │ │ y │
|
|
|
|
* │ │ │ │
|
|
|
|
* │ │ v │
|
|
|
|
* │ │ alloc one blk │
|
|
|
|
* │ │ return -EAGAIN ──┘
|
|
|
|
* │ │ re-enter with one
|
|
|
|
* │ │ less blk to alloc
|
|
|
|
* │ │
|
|
|
|
* │ │
|
|
|
|
* │ └───> set the rmt
|
|
|
|
* │ value
|
|
|
|
* │ │
|
|
|
|
* │ v
|
|
|
|
* │ was this
|
|
|
|
* │ a rename? ──n─┐
|
|
|
|
* │ │ │
|
|
|
|
* │ y │
|
|
|
|
* │ │ │
|
|
|
|
* │ v │
|
|
|
|
* │ flip incomplete │
|
|
|
|
* │ flag │
|
|
|
|
* │ │ │
|
|
|
|
* │ v │
|
|
|
|
* │ XFS_DAS_FLIP_LFLAG │
|
|
|
|
* │ │ │
|
|
|
|
* │ v │
|
|
|
|
* │ need to remove │
|
|
|
|
* │ old blks? ──n──┤
|
|
|
|
* │ │ │
|
|
|
|
* │ y │
|
|
|
|
* │ │ │
|
|
|
|
* │ V │
|
|
|
|
* │ remove │
|
|
|
|
* │ ┌───> old blks │
|
|
|
|
* │ │ │ │
|
|
|
|
* │ XFS_DAS_RM_LBLK │ │
|
|
|
|
* │ ^ │ │
|
|
|
|
* │ │ v │
|
|
|
|
* │ └──y── more to │
|
|
|
|
* │ remove? │
|
|
|
|
* │ │ │
|
|
|
|
* │ n │
|
|
|
|
* │ │ │
|
|
|
|
* │ v │
|
|
|
|
* │ XFS_DAS_RD_LEAF │
|
|
|
|
* │ │ │
|
|
|
|
* │ v │
|
|
|
|
* │ remove leaf │
|
|
|
|
* │ │ │
|
|
|
|
* │ v │
|
|
|
|
* │ shrink to sf │
|
|
|
|
* │ if needed │
|
|
|
|
* │ │ │
|
|
|
|
* │ v │
|
|
|
|
* │ done <──────┘
|
|
|
|
* │
|
|
|
|
* └──────> XFS_DAS_FOUND_NBLK
|
|
|
|
* │
|
|
|
|
* v
|
|
|
|
* ┌─────n── need to
|
|
|
|
* │ alloc blks?
|
|
|
|
* │ │
|
|
|
|
* │ y
|
|
|
|
* │ │
|
|
|
|
* │ v
|
|
|
|
* │ find space
|
|
|
|
* │ │
|
|
|
|
* │ v
|
|
|
|
* │ ┌─>XFS_DAS_ALLOC_NODE
|
|
|
|
* │ │ │
|
|
|
|
* │ │ v
|
|
|
|
* │ │ alloc blk
|
|
|
|
* │ │ │
|
|
|
|
* │ │ v
|
|
|
|
* │ └──y── need to alloc
|
|
|
|
* │ more blocks?
|
|
|
|
* │ │
|
|
|
|
* │ n
|
|
|
|
* │ │
|
|
|
|
* │ v
|
|
|
|
* │ set the rmt value
|
|
|
|
* │ │
|
|
|
|
* │ v
|
|
|
|
* │ was this
|
|
|
|
* └────────> a rename? ──n─┐
|
|
|
|
* │ │
|
|
|
|
* y │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* flip incomplete │
|
|
|
|
* flag │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* XFS_DAS_FLIP_NFLAG │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* need to │
|
|
|
|
* remove blks? ─n──┤
|
|
|
|
* │ │
|
|
|
|
* y │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* remove │
|
|
|
|
* ┌────────> old blks │
|
|
|
|
* │ │ │
|
|
|
|
* XFS_DAS_RM_NBLK │ │
|
|
|
|
* ^ │ │
|
|
|
|
* │ v │
|
|
|
|
* └──────y── more to │
|
|
|
|
* remove │
|
|
|
|
* │ │
|
|
|
|
* n │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* XFS_DAS_CLR_FLAG │
|
|
|
|
* │ │
|
|
|
|
* v │
|
|
|
|
* clear flags │
|
|
|
|
* │ │
|
|
|
|
* ├──────────┘
|
|
|
|
* │
|
|
|
|
* v
|
|
|
|
* done
|
2021-04-27 06:00:33 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * Enum values for xfs_attr_intent.xattri_da_state
 *
 * These values are used by delayed attribute operations to keep track of where
 * they were before they returned -EAGAIN.  A return code of -EAGAIN signals the
 * calling function to roll the transaction, and then call the subroutine to
 * finish the operation.  The enum is then used by the subroutine to jump back
 * to where it was and resume executing where it left off.
 *
 * The enumerators rely on implicit sequential numbering: the ADD/REMOVE pair
 * for each format must stay adjacent, and the node sequence must mirror the
 * leaf sequence.  Do not reorder or insert values without checking both.
 */
enum xfs_delattr_state {
	XFS_DAS_UNINIT		= 0,	/* No state has been set yet */

	/*
	 * Initial sequence states. The replace setup code relies on the
	 * ADD and REMOVE states for a specific format to be sequential so
	 * that we can transform the initial operation to be performed
	 * according to the xfs_has_larp() state easily.
	 */
	XFS_DAS_SF_ADD,			/* Initial sf add state */
	XFS_DAS_SF_REMOVE,		/* Initial sf replace/remove state */

	XFS_DAS_LEAF_ADD,		/* Initial leaf add state */
	XFS_DAS_LEAF_REMOVE,		/* Initial leaf replace/remove state */

	XFS_DAS_NODE_ADD,		/* Initial node add state */
	XFS_DAS_NODE_REMOVE,		/* Initial node replace/remove state */

	/* Leaf state set/replace/remove sequence */
	XFS_DAS_LEAF_SET_RMT,		/* set a remote xattr from a leaf */
	XFS_DAS_LEAF_ALLOC_RMT,		/* We are allocating remote blocks */
	XFS_DAS_LEAF_REPLACE,		/* Perform replace ops on a leaf */
	XFS_DAS_LEAF_REMOVE_OLD,	/* Start removing old attr from leaf */
	XFS_DAS_LEAF_REMOVE_RMT,	/* A rename is removing remote blocks */
	XFS_DAS_LEAF_REMOVE_ATTR,	/* Remove the old attr from a leaf */

	/* Node state sequence, must match leaf state above */
	XFS_DAS_NODE_SET_RMT,		/* set a remote xattr from a node */
	XFS_DAS_NODE_ALLOC_RMT,		/* We are allocating remote blocks */
	XFS_DAS_NODE_REPLACE,		/* Perform replace ops on a node */
	XFS_DAS_NODE_REMOVE_OLD,	/* Start removing old attr from node */
	XFS_DAS_NODE_REMOVE_RMT,	/* A rename is removing remote blocks */
	XFS_DAS_NODE_REMOVE_ATTR,	/* Remove the old attr from a node */

	XFS_DAS_DONE,			/* finished operation */
};
|
|
|
|
|
xfs: separate out initial attr_set states
We currently use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and leaf add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces a XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson<allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:52 +08:00
|
|
|
#define XFS_DAS_STRINGS \
|
2022-05-12 13:12:55 +08:00
|
|
|
{ XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \
|
|
|
|
{ XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \
|
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
To fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, then we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag in the last transaction that logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:56 +08:00
|
|
|
{ XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \
|
2022-05-12 13:12:55 +08:00
|
|
|
{ XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \
|
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
To fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, then we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag in the last transaction that logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:56 +08:00
|
|
|
{ XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \
|
2022-05-12 13:12:55 +08:00
|
|
|
{ XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \
|
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
To fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, then we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag in the last transaction that logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:56 +08:00
|
|
|
{ XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \
|
2022-05-12 13:12:55 +08:00
|
|
|
{ XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \
|
|
|
|
{ XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \
|
|
|
|
{ XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \
|
|
|
|
{ XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \
|
2022-05-12 13:12:55 +08:00
|
|
|
{ XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \
|
2022-05-12 13:12:55 +08:00
|
|
|
{ XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \
|
2022-05-12 13:12:55 +08:00
|
|
|
{ XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \
|
|
|
|
{ XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \
|
|
|
|
{ XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \
|
|
|
|
{ XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \
|
2022-05-12 13:12:55 +08:00
|
|
|
{ XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \
|
2022-05-12 13:12:55 +08:00
|
|
|
{ XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \
|
2022-05-12 13:12:55 +08:00
|
|
|
{ XFS_DAS_DONE, "XFS_DAS_DONE" }
|
xfs: separate out initial attr_set states
We currently use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and node add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces a XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:52 +08:00
|
|
|
|
xfs: share xattr name and value buffers when logging xattr updates
While running xfs/297 and generic/642, I noticed a crash in
xfs_attri_item_relog when it tries to copy the attr name to the new
xattri log item. I think what happened here was that we called
->iop_commit on the old attri item (which nulls out the pointers) as
part of a log force at the same time that a chained attr operation was
ongoing. The system was busy enough that at some later point, the defer
ops operation decided it was necessary to relog the attri log item, but
as we've detached the name buffer from the old attri log item, we can't
copy it to the new one, and kaboom.
I think there's a broader refcounting problem with LARP mode -- the
setxattr code can return to userspace before the CIL actually formats
and commits the log item, which results in a UAF bug. Therefore, the
xattr log item needs to be able to retain a reference to the name and
value buffers until the log items have completely cleared the log.
Furthermore, each time we create an intent log item, we allocate new
memory and (re)copy the contents; sharing here would be very useful.
Solve the UAF and the unnecessary memory allocations by having the log
code create a single refcounted buffer to contain the name and value
contents. This buffer can be passed from old to new during a relog
operation, and the logging code can (optionally) attach it to the
xfs_attr_item for reuse when LARP mode is enabled.
This also fixes a problem where the xfs_attri_log_item objects weren't
being freed back to the same cache where they came from.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-23 06:43:46 +08:00
|
|
|
struct xfs_attri_log_nameval;
|
|
|
|
|
2021-04-27 06:00:33 +08:00
|
|
|
/*
|
|
|
|
* Context used for keeping track of delayed attribute operations
|
|
|
|
*/
|
2022-05-22 14:00:26 +08:00
|
|
|
struct xfs_attr_intent {
|
2022-05-22 13:59:48 +08:00
|
|
|
/*
|
|
|
|
* used to log this item to an intent containing a list of attrs to
|
|
|
|
* commit later
|
|
|
|
*/
|
|
|
|
struct list_head xattri_list;
|
|
|
|
|
|
|
|
/* Used in xfs_attr_node_removename to roll through removing blocks */
|
|
|
|
struct xfs_da_state *xattri_da_state;
|
|
|
|
|
2022-05-11 15:01:22 +08:00
|
|
|
struct xfs_da_args *xattri_da_args;
|
2021-04-27 06:00:33 +08:00
|
|
|
|
xfs: share xattr name and value buffers when logging xattr updates
While running xfs/297 and generic/642, I noticed a crash in
xfs_attri_item_relog when it tries to copy the attr name to the new
xattri log item. I think what happened here was that we called
->iop_commit on the old attri item (which nulls out the pointers) as
part of a log force at the same time that a chained attr operation was
ongoing. The system was busy enough that at some later point, the defer
ops operation decided it was necessary to relog the attri log item, but
as we've detached the name buffer from the old attri log item, we can't
copy it to the new one, and kaboom.
I think there's a broader refcounting problem with LARP mode -- the
setxattr code can return to userspace before the CIL actually formats
and commits the log item, which results in a UAF bug. Therefore, the
xattr log item needs to be able to retain a reference to the name and
value buffers until the log items have completely cleared the log.
Furthermore, each time we create an intent log item, we allocate new
memory and (re)copy the contents; sharing here would be very useful.
Solve the UAF and the unnecessary memory allocations by having the log
code create a single refcounted buffer to contain the name and value
contents. This buffer can be passed from old to new during a relog
operation, and the logging code can (optionally) attach it to the
xfs_attr_item for reuse when LARP mode is enabled.
This also fixes a problem where the xfs_attri_log_item objects weren't
being freed back to the same cache where they came from.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-23 06:43:46 +08:00
|
|
|
/*
|
2024-04-23 00:47:42 +08:00
|
|
|
* Shared buffer containing the attr name, new name, and value so that
|
|
|
|
* the logging code can share large memory buffers between log items.
|
xfs: share xattr name and value buffers when logging xattr updates
While running xfs/297 and generic/642, I noticed a crash in
xfs_attri_item_relog when it tries to copy the attr name to the new
xattri log item. I think what happened here was that we called
->iop_commit on the old attri item (which nulls out the pointers) as
part of a log force at the same time that a chained attr operation was
ongoing. The system was busy enough that at some later point, the defer
ops operation decided it was necessary to relog the attri log item, but
as we've detached the name buffer from the old attri log item, we can't
copy it to the new one, and kaboom.
I think there's a broader refcounting problem with LARP mode -- the
setxattr code can return to userspace before the CIL actually formats
and commits the log item, which results in a UAF bug. Therefore, the
xattr log item needs to be able to retain a reference to the name and
value buffers until the log items have completely cleared the log.
Furthermore, each time we create an intent log item, we allocate new
memory and (re)copy the contents; sharing here would be very useful.
Solve the UAF and the unnecessary memory allocations by having the log
code create a single refcounted buffer to contain the name and value
contents. This buffer can be passed from old to new during a relog
operation, and the logging code can (optionally) attach it to the
xfs_attr_item for reuse when LARP mode is enabled.
This also fixes a problem where the xfs_attri_log_item objects weren't
being freed back to the same cache where they came from.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-23 06:43:46 +08:00
|
|
|
*/
|
|
|
|
struct xfs_attri_log_nameval *xattri_nameval;
|
|
|
|
|
2021-04-27 06:00:33 +08:00
|
|
|
/* Used to keep track of current state of delayed operation */
|
2022-05-11 15:01:22 +08:00
|
|
|
enum xfs_delattr_state xattri_dela_state;
|
2022-05-04 10:41:02 +08:00
|
|
|
|
|
|
|
/*
|
2022-05-22 13:59:48 +08:00
|
|
|
* Attr operation being performed - XFS_ATTRI_OP_FLAGS_*
|
2022-05-04 10:41:02 +08:00
|
|
|
*/
|
|
|
|
unsigned int xattri_op_flags;
|
|
|
|
|
2022-05-22 13:59:48 +08:00
|
|
|
/* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
|
|
|
|
xfs_dablk_t xattri_lblkno;
|
|
|
|
int xattri_blkcnt;
|
|
|
|
struct xfs_bmbt_irec xattri_map;
|
2022-05-04 10:41:02 +08:00
|
|
|
};
|
|
|
|
|
2024-04-23 00:47:29 +08:00
|
|
|
static inline unsigned int
|
|
|
|
xfs_attr_intent_op(const struct xfs_attr_intent *attr)
|
|
|
|
{
|
|
|
|
return attr->xattri_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK;
|
|
|
|
}
|
2022-05-04 10:41:02 +08:00
|
|
|
|
2008-06-23 11:23:48 +08:00
|
|
|
/*========================================================================
|
|
|
|
* Function prototypes for the kernel.
|
|
|
|
*========================================================================*/
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Overall external interface routines.
|
|
|
|
*/
|
|
|
|
int xfs_attr_inactive(struct xfs_inode *dp);
|
2020-02-27 09:30:39 +08:00
|
|
|
int xfs_attr_list_ilocked(struct xfs_attr_list_context *);
|
|
|
|
int xfs_attr_list(struct xfs_attr_list_context *);
|
2013-08-12 18:49:38 +08:00
|
|
|
int xfs_inode_hasattr(struct xfs_inode *ip);
|
2021-04-14 02:15:10 +08:00
|
|
|
bool xfs_attr_is_leaf(struct xfs_inode *ip);
|
2020-02-27 09:30:34 +08:00
|
|
|
int xfs_attr_get_ilocked(struct xfs_da_args *args);
|
2020-02-27 09:30:34 +08:00
|
|
|
int xfs_attr_get(struct xfs_da_args *args);
|
2024-04-23 00:47:21 +08:00
|
|
|
|
|
|
|
/*
 * Kind of update passed as the "op" argument of xfs_attr_set().
 * Enumerators keep their implicit values (starting at 0) in this order.
 */
enum xfs_attr_update {
	XFS_ATTRUPDATE_REMOVE,	/* remove attr */
	XFS_ATTRUPDATE_UPSERT,	/* set value, replace any existing attr */
	XFS_ATTRUPDATE_CREATE,	/* set value, fail if attr already exists */
	XFS_ATTRUPDATE_REPLACE,	/* set value, fail if attr does not exist */
};
|
|
|
|
|
2024-04-23 00:48:07 +08:00
|
|
|
int xfs_attr_set(struct xfs_da_args *args, enum xfs_attr_update op, bool rsvd);
|
2022-05-22 14:00:26 +08:00
|
|
|
int xfs_attr_set_iter(struct xfs_attr_intent *attr);
|
|
|
|
int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
|
2024-04-23 00:47:34 +08:00
|
|
|
bool xfs_attr_check_namespace(unsigned int attr_flags);
|
|
|
|
bool xfs_attr_namecheck(unsigned int attr_flags, const void *name,
|
|
|
|
size_t length);
|
2022-05-04 10:41:02 +08:00
|
|
|
int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
|
2024-05-22 14:02:01 +08:00
|
|
|
struct xfs_trans_res xfs_attr_set_resv(const struct xfs_da_args *args);
|
2022-05-04 10:41:02 +08:00
|
|
|
|
xfs: separate out initial attr_set states
We currently use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and node add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces a XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:52 +08:00
|
|
|
/*
|
|
|
|
* Check to see if the attr should be upgraded from non-existent or shortform to
|
|
|
|
* single-leaf-block attribute list.
|
|
|
|
*/
|
|
|
|
static inline bool
|
|
|
|
xfs_attr_is_shortform(
|
|
|
|
struct xfs_inode *ip)
|
|
|
|
{
|
xfs: make inode attribute forks a permanent part of struct xfs_inode
Syzkaller reported a UAF bug a while back:
==================================================================
BUG: KASAN: use-after-free in xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
Read of size 4 at addr ffff88802cec919c by task syz-executor262/2958
CPU: 2 PID: 2958 Comm: syz-executor262 Not tainted
5.15.0-0.30.3-20220406_1406 #3
Hardware name: Red Hat KVM, BIOS 1.13.0-2.module+el8.3.0+7860+a7792d29
04/01/2014
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x82/0xa9 lib/dump_stack.c:106
print_address_description.constprop.9+0x21/0x2d5 mm/kasan/report.c:256
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold.14+0x7f/0x11b mm/kasan/report.c:459
xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
xfs_attr_get+0x378/0x4c2 fs/xfs/libxfs/xfs_attr.c:159
xfs_xattr_get+0xe3/0x150 fs/xfs/xfs_xattr.c:36
__vfs_getxattr+0xdf/0x13d fs/xattr.c:399
cap_inode_need_killpriv+0x41/0x5d security/commoncap.c:300
security_inode_need_killpriv+0x4c/0x97 security/security.c:1408
dentry_needs_remove_privs.part.28+0x21/0x63 fs/inode.c:1912
dentry_needs_remove_privs+0x80/0x9e fs/inode.c:1908
do_truncate+0xc3/0x1e0 fs/open.c:56
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
RIP: 0033:0x7f7ef4bb753d
Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48
89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73
01 c3 48 8b 0d 1b 79 2c 00 f7 d8 64 89 01 48
RSP: 002b:00007f7ef52c2ed8 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
RAX: ffffffffffffffda RBX: 0000000000404148 RCX: 00007f7ef4bb753d
RDX: 00007f7ef4bb753d RSI: 0000000000000000 RDI: 0000000020004fc0
RBP: 0000000000404140 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0030656c69662f2e
R13: 00007ffd794db37f R14: 00007ffd794db470 R15: 00007f7ef52c2fc0
</TASK>
Allocated by task 2953:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:46 [inline]
set_alloc_info mm/kasan/common.c:434 [inline]
__kasan_slab_alloc+0x68/0x7c mm/kasan/common.c:467
kasan_slab_alloc include/linux/kasan.h:254 [inline]
slab_post_alloc_hook mm/slab.h:519 [inline]
slab_alloc_node mm/slub.c:3213 [inline]
slab_alloc mm/slub.c:3221 [inline]
kmem_cache_alloc+0x11b/0x3eb mm/slub.c:3226
kmem_cache_zalloc include/linux/slab.h:711 [inline]
xfs_ifork_alloc+0x25/0xa2 fs/xfs/libxfs/xfs_inode_fork.c:287
xfs_bmap_add_attrfork+0x3f2/0x9b1 fs/xfs/libxfs/xfs_bmap.c:1098
xfs_attr_set+0xe38/0x12a7 fs/xfs/libxfs/xfs_attr.c:746
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_setxattr+0x11b/0x177 fs/xattr.c:180
__vfs_setxattr_noperm+0x128/0x5e0 fs/xattr.c:214
__vfs_setxattr_locked+0x1d4/0x258 fs/xattr.c:275
vfs_setxattr+0x154/0x33d fs/xattr.c:301
setxattr+0x216/0x29f fs/xattr.c:575
__do_sys_fsetxattr fs/xattr.c:632 [inline]
__se_sys_fsetxattr fs/xattr.c:621 [inline]
__x64_sys_fsetxattr+0x243/0x2fe fs/xattr.c:621
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
Freed by task 2949:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track+0x1c/0x21 mm/kasan/common.c:46
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:360
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free mm/kasan/common.c:328 [inline]
__kasan_slab_free+0xe2/0x10e mm/kasan/common.c:374
kasan_slab_free include/linux/kasan.h:230 [inline]
slab_free_hook mm/slub.c:1700 [inline]
slab_free_freelist_hook mm/slub.c:1726 [inline]
slab_free mm/slub.c:3492 [inline]
kmem_cache_free+0xdc/0x3ce mm/slub.c:3508
xfs_attr_fork_remove+0x8d/0x132 fs/xfs/libxfs/xfs_attr_leaf.c:773
xfs_attr_sf_removename+0x5dd/0x6cb fs/xfs/libxfs/xfs_attr_leaf.c:822
xfs_attr_remove_iter+0x68c/0x805 fs/xfs/libxfs/xfs_attr.c:1413
xfs_attr_remove_args+0xb1/0x10d fs/xfs/libxfs/xfs_attr.c:684
xfs_attr_set+0xf1e/0x12a7 fs/xfs/libxfs/xfs_attr.c:802
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_removexattr+0x106/0x16a fs/xattr.c:468
cap_inode_killpriv+0x24/0x47 security/commoncap.c:324
security_inode_killpriv+0x54/0xa1 security/security.c:1414
setattr_prepare+0x1a6/0x897 fs/attr.c:146
xfs_vn_change_ok+0x111/0x15e fs/xfs/xfs_iops.c:682
xfs_vn_setattr_size+0x5f/0x15a fs/xfs/xfs_iops.c:1065
xfs_vn_setattr+0x125/0x2ad fs/xfs/xfs_iops.c:1093
notify_change+0xae5/0x10a1 fs/attr.c:410
do_truncate+0x134/0x1e0 fs/open.c:64
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
The buggy address belongs to the object at ffff88802cec9188
which belongs to the cache xfs_ifork of size 40
The buggy address is located 20 bytes inside of
40-byte region [ffff88802cec9188, ffff88802cec91b0)
The buggy address belongs to the page:
page:00000000c3af36a1 refcount:1 mapcount:0 mapping:0000000000000000
index:0x0 pfn:0x2cec9
flags: 0xfffffc0000200(slab|node=0|zone=1|lastcpupid=0x1fffff)
raw: 000fffffc0000200 ffffea00009d2580 0000000600000006 ffff88801a9ffc80
raw: 0000000000000000 0000000080490049 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff88802cec9080: fb fb fb fc fc fa fb fb fb fb fc fc fb fb fb fb
ffff88802cec9100: fb fc fc fb fb fb fb fb fc fc fb fb fb fb fb fc
>ffff88802cec9180: fc fa fb fb fb fb fc fc fa fb fb fb fb fc fc fb
^
ffff88802cec9200: fb fb fb fb fc fc fb fb fb fb fb fc fc fb fb fb
ffff88802cec9280: fb fb fc fc fa fb fb fb fb fc fc fa fb fb fb fb
==================================================================
The root cause of this bug is the unlocked access to xfs_inode.i_afp
from the getxattr code paths while trying to determine which ILOCK mode
to use to stabilize the xattr data. Unfortunately, the VFS does not
acquire i_rwsem when vfs_getxattr (or listxattr) call into the
filesystem, which means that getxattr can race with a removexattr that's
tearing down the attr fork and crash:
xfs_attr_set: xfs_attr_get:
xfs_attr_fork_remove: xfs_ilock_attr_map_shared:
xfs_idestroy_fork(ip->i_afp);
kmem_cache_free(xfs_ifork_cache, ip->i_afp);
if (ip->i_afp &&
ip->i_afp = NULL;
xfs_need_iread_extents(ip->i_afp))
<KABOOM>
ip->i_forkoff = 0;
Regrettably, the VFS is much more lax about i_rwsem and getxattr than
is immediately obvious -- not only does it not guarantee that we hold
i_rwsem, it actually doesn't guarantee that we *don't* hold it either.
The getxattr system call won't acquire the lock before calling XFS, but
the file capabilities code calls getxattr with and without i_rwsem held
to determine if the "security.capabilities" xattr is set on the file.
Fixing the VFS locking requires a treewide investigation into every code
path that could touch an xattr and what i_rwsem state it expects or sets
up. That could take years or even prove impossible; fortunately, we
can fix this UAF problem inside XFS.
An earlier version of this patch used smp_wmb in xfs_attr_fork_remove to
ensure that i_forkoff is always zeroed before i_afp is set to null and
changed the read paths to use smp_rmb before accessing i_forkoff and
i_afp, which avoided these UAF problems. However, the patch author was
too busy dealing with other problems in the meantime, and by the time he
came back to this issue, the situation had changed a bit.
On a modern system with selinux, each inode will always have at least
one xattr for the selinux label, so it doesn't make much sense to keep
incurring the extra pointer dereference. Furthermore, Allison's
upcoming parent pointer patchset will also cause nearly every inode in
the filesystem to have extended attributes. Therefore, make the inode
attribute fork structure part of struct xfs_inode, at a cost of 40 more
bytes.
This patch adds a clunky if_present field where necessary to maintain
the existing logic of xattr fork null pointer testing in the existing
codebase. The next patch switches the logic over to XFS_IFORK_Q and it
all goes away.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2022-07-10 01:56:06 +08:00
|
|
|
return ip->i_af.if_format == XFS_DINODE_FMT_LOCAL ||
|
|
|
|
(ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
|
|
|
|
ip->i_af.if_nextents == 0);
|
xfs: separate out initial attr_set states
We currently use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and node add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces a XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:52 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline enum xfs_delattr_state
|
|
|
|
xfs_attr_init_add_state(struct xfs_da_args *args)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* When called from the completion of a attr remove to determine the
|
|
|
|
* next state, the attribute fork may be null. This can occur only occur
|
|
|
|
* on a pure remove, but we grab the next state before we check if a
|
|
|
|
* replace operation is being performed. If we are called from any other
|
xfs: make inode attribute forks a permanent part of struct xfs_inode
Syzkaller reported a UAF bug a while back:
==================================================================
BUG: KASAN: use-after-free in xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
Read of size 4 at addr ffff88802cec919c by task syz-executor262/2958
CPU: 2 PID: 2958 Comm: syz-executor262 Not tainted
5.15.0-0.30.3-20220406_1406 #3
Hardware name: Red Hat KVM, BIOS 1.13.0-2.module+el8.3.0+7860+a7792d29
04/01/2014
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x82/0xa9 lib/dump_stack.c:106
print_address_description.constprop.9+0x21/0x2d5 mm/kasan/report.c:256
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold.14+0x7f/0x11b mm/kasan/report.c:459
xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
xfs_attr_get+0x378/0x4c2 fs/xfs/libxfs/xfs_attr.c:159
xfs_xattr_get+0xe3/0x150 fs/xfs/xfs_xattr.c:36
__vfs_getxattr+0xdf/0x13d fs/xattr.c:399
cap_inode_need_killpriv+0x41/0x5d security/commoncap.c:300
security_inode_need_killpriv+0x4c/0x97 security/security.c:1408
dentry_needs_remove_privs.part.28+0x21/0x63 fs/inode.c:1912
dentry_needs_remove_privs+0x80/0x9e fs/inode.c:1908
do_truncate+0xc3/0x1e0 fs/open.c:56
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
RIP: 0033:0x7f7ef4bb753d
Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48
89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73
01 c3 48 8b 0d 1b 79 2c 00 f7 d8 64 89 01 48
RSP: 002b:00007f7ef52c2ed8 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
RAX: ffffffffffffffda RBX: 0000000000404148 RCX: 00007f7ef4bb753d
RDX: 00007f7ef4bb753d RSI: 0000000000000000 RDI: 0000000020004fc0
RBP: 0000000000404140 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0030656c69662f2e
R13: 00007ffd794db37f R14: 00007ffd794db470 R15: 00007f7ef52c2fc0
</TASK>
Allocated by task 2953:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:46 [inline]
set_alloc_info mm/kasan/common.c:434 [inline]
__kasan_slab_alloc+0x68/0x7c mm/kasan/common.c:467
kasan_slab_alloc include/linux/kasan.h:254 [inline]
slab_post_alloc_hook mm/slab.h:519 [inline]
slab_alloc_node mm/slub.c:3213 [inline]
slab_alloc mm/slub.c:3221 [inline]
kmem_cache_alloc+0x11b/0x3eb mm/slub.c:3226
kmem_cache_zalloc include/linux/slab.h:711 [inline]
xfs_ifork_alloc+0x25/0xa2 fs/xfs/libxfs/xfs_inode_fork.c:287
xfs_bmap_add_attrfork+0x3f2/0x9b1 fs/xfs/libxfs/xfs_bmap.c:1098
xfs_attr_set+0xe38/0x12a7 fs/xfs/libxfs/xfs_attr.c:746
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_setxattr+0x11b/0x177 fs/xattr.c:180
__vfs_setxattr_noperm+0x128/0x5e0 fs/xattr.c:214
__vfs_setxattr_locked+0x1d4/0x258 fs/xattr.c:275
vfs_setxattr+0x154/0x33d fs/xattr.c:301
setxattr+0x216/0x29f fs/xattr.c:575
__do_sys_fsetxattr fs/xattr.c:632 [inline]
__se_sys_fsetxattr fs/xattr.c:621 [inline]
__x64_sys_fsetxattr+0x243/0x2fe fs/xattr.c:621
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
Freed by task 2949:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track+0x1c/0x21 mm/kasan/common.c:46
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:360
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free mm/kasan/common.c:328 [inline]
__kasan_slab_free+0xe2/0x10e mm/kasan/common.c:374
kasan_slab_free include/linux/kasan.h:230 [inline]
slab_free_hook mm/slub.c:1700 [inline]
slab_free_freelist_hook mm/slub.c:1726 [inline]
slab_free mm/slub.c:3492 [inline]
kmem_cache_free+0xdc/0x3ce mm/slub.c:3508
xfs_attr_fork_remove+0x8d/0x132 fs/xfs/libxfs/xfs_attr_leaf.c:773
xfs_attr_sf_removename+0x5dd/0x6cb fs/xfs/libxfs/xfs_attr_leaf.c:822
xfs_attr_remove_iter+0x68c/0x805 fs/xfs/libxfs/xfs_attr.c:1413
xfs_attr_remove_args+0xb1/0x10d fs/xfs/libxfs/xfs_attr.c:684
xfs_attr_set+0xf1e/0x12a7 fs/xfs/libxfs/xfs_attr.c:802
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_removexattr+0x106/0x16a fs/xattr.c:468
cap_inode_killpriv+0x24/0x47 security/commoncap.c:324
security_inode_killpriv+0x54/0xa1 security/security.c:1414
setattr_prepare+0x1a6/0x897 fs/attr.c:146
xfs_vn_change_ok+0x111/0x15e fs/xfs/xfs_iops.c:682
xfs_vn_setattr_size+0x5f/0x15a fs/xfs/xfs_iops.c:1065
xfs_vn_setattr+0x125/0x2ad fs/xfs/xfs_iops.c:1093
notify_change+0xae5/0x10a1 fs/attr.c:410
do_truncate+0x134/0x1e0 fs/open.c:64
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
The buggy address belongs to the object at ffff88802cec9188
which belongs to the cache xfs_ifork of size 40
The buggy address is located 20 bytes inside of
40-byte region [ffff88802cec9188, ffff88802cec91b0)
The buggy address belongs to the page:
page:00000000c3af36a1 refcount:1 mapcount:0 mapping:0000000000000000
index:0x0 pfn:0x2cec9
flags: 0xfffffc0000200(slab|node=0|zone=1|lastcpupid=0x1fffff)
raw: 000fffffc0000200 ffffea00009d2580 0000000600000006 ffff88801a9ffc80
raw: 0000000000000000 0000000080490049 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff88802cec9080: fb fb fb fc fc fa fb fb fb fb fc fc fb fb fb fb
ffff88802cec9100: fb fc fc fb fb fb fb fb fc fc fb fb fb fb fb fc
>ffff88802cec9180: fc fa fb fb fb fb fc fc fa fb fb fb fb fc fc fb
^
ffff88802cec9200: fb fb fb fb fc fc fb fb fb fb fb fc fc fb fb fb
ffff88802cec9280: fb fb fc fc fa fb fb fb fb fc fc fa fb fb fb fb
==================================================================
The root cause of this bug is the unlocked access to xfs_inode.i_afp
from the getxattr code paths while trying to determine which ILOCK mode
to use to stabilize the xattr data. Unfortunately, the VFS does not
acquire i_rwsem when vfs_getxattr (or listxattr) call into the
filesystem, which means that getxattr can race with a removexattr that's
tearing down the attr fork and crash:
xfs_attr_set: xfs_attr_get:
xfs_attr_fork_remove: xfs_ilock_attr_map_shared:
xfs_idestroy_fork(ip->i_afp);
kmem_cache_free(xfs_ifork_cache, ip->i_afp);
if (ip->i_afp &&
ip->i_afp = NULL;
xfs_need_iread_extents(ip->i_afp))
<KABOOM>
ip->i_forkoff = 0;
Regrettably, the VFS is much more lax about i_rwsem and getxattr than
is immediately obvious -- not only does it not guarantee that we hold
i_rwsem, it actually doesn't guarantee that we *don't* hold it either.
The getxattr system call won't acquire the lock before calling XFS, but
the file capabilities code calls getxattr with and without i_rwsem held
to determine if the "security.capabilities" xattr is set on the file.
Fixing the VFS locking requires a treewide investigation into every code
path that could touch an xattr and what i_rwsem state it expects or sets
up. That could take years or even prove impossible; fortunately, we
can fix this UAF problem inside XFS.
An earlier version of this patch used smp_wmb in xfs_attr_fork_remove to
ensure that i_forkoff is always zeroed before i_afp is set to null and
changed the read paths to use smp_rmb before accessing i_forkoff and
i_afp, which avoided these UAF problems. However, the patch author was
too busy dealing with other problems in the meantime, and by the time he
came back to this issue, the situation had changed a bit.
On a modern system with selinux, each inode will always have at least
one xattr for the selinux label, so it doesn't make much sense to keep
incurring the extra pointer dereference. Furthermore, Allison's
upcoming parent pointer patchset will also cause nearly every inode in
the filesystem to have extended attributes. Therefore, make the inode
attribute fork structure part of struct xfs_inode, at a cost of 40 more
bytes.
This patch adds a clunky if_present field where necessary to maintain
the existing logic of xattr fork null pointer testing in the existing
codebase. The next patch switches the logic over to XFS_IFORK_Q and it
all goes away.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2022-07-10 01:56:06 +08:00
|
|
|
* context, i_af is guaranteed to exist. Hence if the attr fork is
|
xfs: separate out initial attr_set states
We current use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and leaf add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces a XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson<allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:52 +08:00
|
|
|
* null, we were called from a pure remove operation and so we are done.
|
|
|
|
*/
|
2022-07-10 01:56:06 +08:00
|
|
|
if (!xfs_inode_has_attr_fork(args->dp))
|
xfs: separate out initial attr_set states
We current use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and leaf add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces a XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson<allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:52 +08:00
|
|
|
return XFS_DAS_DONE;
|
xfs: use XFS_DA_OP flags in deferred attr ops
We currently store the high level attr operation in
args->attr_flags. This field contains what the VFS is telling us to
do, but don't necessarily match what we are doing in the low level
modification state machine. e.g. XATTR_REPLACE implies both
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME because it is doing both a
remove and adding a new attr.
However, deep in the individual state machine operations, we check
errors against this high level VFS op flags, not the low level
XFS_DA_OP flags. Indeed, we don't even have a low level flag for
a REMOVE operation, so the only way we know we are doing a remove
is the complete absence of XATTR_REPLACE, XATTR_CREATE,
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME. And because there are other
flags in these fields, this is a pain to check if we need to.
As the XFS_DA_OP flags are only needed once the deferred operations
are set up, set these flags appropriately when we set the initial
operation state. We also introduce a XFS_DA_OP_REMOVE flag to make
it easy to know that we are doing a remove operation.
With these, we can remove the use of XATTR_REPLACE and XATTR_CREATE
in low level lookup operations, and manipulate the low level flags
according to the low level context that is operating. e.g. log
recovery does not have a VFS xattr operation state to copy into
args->attr_flags, and the low level state machine ops we do for
recovery do not match the high level VFS operations that were in
progress when the system failed...
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:56 +08:00
|
|
|
|
|
|
|
args->op_flags |= XFS_DA_OP_ADDNAME;
|
xfs: separate out initial attr_set states
We current use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and leaf add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces a XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson<allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:52 +08:00
|
|
|
if (xfs_attr_is_shortform(args->dp))
|
|
|
|
return XFS_DAS_SF_ADD;
|
|
|
|
if (xfs_attr_is_leaf(args->dp))
|
|
|
|
return XFS_DAS_LEAF_ADD;
|
|
|
|
return XFS_DAS_NODE_ADD;
|
|
|
|
}
|
|
|
|
|
2022-05-12 13:12:56 +08:00
|
|
|
static inline enum xfs_delattr_state
|
|
|
|
xfs_attr_init_remove_state(struct xfs_da_args *args)
|
|
|
|
{
|
|
|
|
if (xfs_attr_is_shortform(args->dp))
|
|
|
|
return XFS_DAS_SF_REMOVE;
|
|
|
|
if (xfs_attr_is_leaf(args->dp))
|
|
|
|
return XFS_DAS_LEAF_REMOVE;
|
|
|
|
return XFS_DAS_NODE_REMOVE;
|
|
|
|
}
|
|
|
|
|
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
To fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, then we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag in the last transaction that logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:56 +08:00
|
|
|
/*
|
|
|
|
* If we are logging the attributes, then we have to start with removal of the
|
|
|
|
* old attribute so that there is always consistent state that we can recover
|
|
|
|
* from if the system goes down part way through. We always log the new attr
|
|
|
|
* value, so even when we remove the attr first we still have the information in
|
|
|
|
* the log to finish the replace operation atomically.
|
|
|
|
*/
|
xfs: separate out initial attr_set states
We currently use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and node add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces a XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:52 +08:00
|
|
|
static inline enum xfs_delattr_state
|
|
|
|
xfs_attr_init_replace_state(struct xfs_da_args *args)
|
|
|
|
{
|
xfs: use XFS_DA_OP flags in deferred attr ops
We currently store the high level attr operation in
args->attr_flags. This field contains what the VFS is telling us to
do, but don't necessarily match what we are doing in the low level
modification state machine. e.g. XATTR_REPLACE implies both
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME because it is doing both a
remove and adding a new attr.
However, deep in the individual state machine operations, we check
errors against this high level VFS op flags, not the low level
XFS_DA_OP flags. Indeed, we don't even have a low level flag for
a REMOVE operation, so the only way we know we are doing a remove
is the complete absence of XATTR_REPLACE, XATTR_CREATE,
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME. And because there are other
flags in these fields, this is a pain to check if we need to.
As the XFS_DA_OP flags are only needed once the deferred operations
are set up, set these flags appropriately when we set the initial
operation state. We also introduce a XFS_DA_OP_REMOVE flag to make
it easy to know that we are doing a remove operation.
With these, we can remove the use of XATTR_REPLACE and XATTR_CREATE
in low level lookup operations, and manipulate the low level flags
according to the low level context that is operating. e.g. log
recovery does not have a VFS xattr operation state to copy into
args->attr_flags, and the low level state machine ops we do for
recovery do not match the high level VFS operations that were in
progress when the system failed...
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:56 +08:00
|
|
|
args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE;
|
xfs: fix TOCTOU race involving the new logged xattrs control knob
I found a race involving the larp control knob, aka the debugging knob
that lets developers enable logging of extended attribute updates:
Thread 1 Thread 2
echo 0 > /sys/fs/xfs/debug/larp
setxattr(REPLACE)
xfs_has_larp (returns false)
xfs_attr_set
echo 1 > /sys/fs/xfs/debug/larp
xfs_attr_defer_replace
xfs_attr_init_replace_state
xfs_has_larp (returns true)
xfs_attr_init_remove_state
<oops, wrong DAS state!>
This isn't a particularly severe problem right now because xattr logging
is only enabled when CONFIG_XFS_DEBUG=y, and developers *should* know
what they're doing.
However, the eventual intent is that callers should be able to ask for
the assistance of the log in persisting xattr updates. This capability
might not be required for /all/ callers, which means that dynamic
control must work correctly. Once an xattr update has decided whether
or not to use logged xattrs, it needs to stay in that mode until the end
of the operation regardless of what subsequent parallel operations might
do.
Therefore, it is an error to continue sampling xfs_globals.larp once
xfs_attr_change has made a decision about larp, and it was not correct
for me to have told Allison that ->create_intent functions can sample
the global log incompat feature bitfield to decide to elide a log item.
Instead, create a new op flag for the xfs_da_args structure, and convert
all other callers of xfs_has_larp and xfs_sb_version_haslogxattrs within
the attr update state machine to look for the operations flag.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
2022-06-06 09:51:22 +08:00
|
|
|
if (args->op_flags & XFS_DA_OP_LOGGED)
|
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
TO fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, the we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag at in the last transaction taht logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:56 +08:00
|
|
|
return xfs_attr_init_remove_state(args);
|
xfs: separate out initial attr_set states
We current use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and node add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces an XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 13:12:52 +08:00
|
|
|
return xfs_attr_init_add_state(args);
|
|
|
|
}
|
|
|
|
|
2024-04-23 00:47:38 +08:00
|
|
|
xfs_dahash_t xfs_attr_hashname(const uint8_t *name, int namelen);
|
|
|
|
|
|
|
|
xfs_dahash_t xfs_attr_hashval(struct xfs_mount *mp, unsigned int attr_flags,
|
|
|
|
const uint8_t *name, int namelen, const void *value,
|
|
|
|
int valuelen);
|
|
|
|
|
|
|
|
/* Set the hash value for any extended attribute from any namespace. */
|
|
|
|
static inline void xfs_attr_sethash(struct xfs_da_args *args)
|
|
|
|
{
|
|
|
|
args->hashval = xfs_attr_hashval(args->dp->i_mount, args->attr_filter,
|
|
|
|
args->name, args->namelen,
|
|
|
|
args->value, args->valuelen);
|
|
|
|
}
|
|
|
|
|
2022-05-22 13:59:48 +08:00
|
|
|
extern struct kmem_cache *xfs_attr_intent_cache;
|
|
|
|
int __init xfs_attr_intent_init_cache(void);
|
|
|
|
void xfs_attr_intent_destroy_cache(void);
|
|
|
|
|
2024-04-16 05:54:45 +08:00
|
|
|
int xfs_attr_sf_totsize(struct xfs_inode *dp);
|
2024-04-23 00:48:16 +08:00
|
|
|
int xfs_attr_add_fork(struct xfs_inode *ip, int size, int rsvd);
|
2024-04-16 05:54:45 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* __XFS_ATTR_H__ */
|