From 695b46e76b62447e506cddc87e088236498008e5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 15 Mar 2018 23:39:01 +0200 Subject: [PATCH 01/19] ovl: set i_ino to the value of st_ino for NFS export Eddie Horng reported that readdir of an overlayfs directory that was exported via NFSv3 returns entries with d_type set to DT_UNKNOWN. The reason is that while preparing the response for readdirplus, nfsd checks inside encode_entryplus_baggage() that a child dentry's inode number matches the value of d_ino returns by overlayfs readdir iterator. Because the overlayfs inodes use arbitrary inode numbers that are not correlated with the values of st_ino/d_ino, NFSv3 falls back to not encoding d_type. Although this is an allowed behavior, we can fix it for the case of all overlayfs layers on the same underlying filesystem. When NFS export is enabled and d_ino is consistent with st_ino (samefs), set the same value also to i_ino in ovl_fill_inode() for all overlayfs inodes, nfsd readdirplus sanity checks will pass. ovl_fill_inode() may be called from ovl_new_inode(), before real inode was created with ino arg 0. In that case, i_ino will be updated to real upper inode i_ino on ovl_inode_init() or ovl_inode_update(). Reported-by: Eddie Horng Tested-by: Eddie Horng Signed-off-by: Amir Goldstein Fixes: 8383f1748829 ("ovl: wire up NFS export operations") Cc: #v4.16 Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 21 +++++++++++++++++---- fs/overlayfs/util.c | 8 +++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 3b1bd469accd..4689716f23d8 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -459,9 +459,20 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) #endif } -static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) +static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, + unsigned long ino) { - inode->i_ino = get_next_ino(); + /* + * When NFS export is enabled and d_ino is consistent with st_ino + * (samefs), set the same value to i_ino, because nfsd readdirplus + * compares d_ino values to i_ino values of child entries. When called + * from ovl_new_inode(), ino arg is 0, so i_ino will be updated to real + * upper inode i_ino on ovl_inode_init() or ovl_inode_update(). + */ + if (inode->i_sb->s_export_op && ovl_same_sb(inode->i_sb)) + inode->i_ino = ino; + else + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; #ifdef CONFIG_FS_POSIX_ACL @@ -597,7 +608,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) inode = new_inode(sb); if (inode) - ovl_fill_inode(inode, mode, rdev); + ovl_fill_inode(inode, mode, rdev, 0); return inode; } @@ -710,6 +721,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, struct inode *inode; bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index); bool is_dir; + unsigned long ino = 0; if (!realinode) realinode = d_inode(lowerdentry); @@ -748,13 +760,14 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, if (!is_dir) nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink); set_nlink(inode, nlink); + ino = key->i_ino; } else { /* Lower hardlink that will be broken on copy up */ inode = new_inode(sb); if (!inode) goto out_nomem; } - ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); + ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino); ovl_inode_init(inode, upperdentry, lowerdentry); if (upperdentry && ovl_is_impuredir(upperdentry)) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 930784a26623..493f9b76fbf6 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -279,12 +279,16 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect) void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, struct dentry *lowerdentry) { + struct inode *realinode = d_inode(upperdentry ?: lowerdentry); + if (upperdentry) OVL_I(inode)->__upperdentry = upperdentry; if (lowerdentry) OVL_I(inode)->lower = igrab(d_inode(lowerdentry)); - ovl_copyattr(d_inode(upperdentry ?: lowerdentry), inode); + ovl_copyattr(realinode, inode); + if (!inode->i_ino) + inode->i_ino = realinode->i_ino; } void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) @@ -299,6 +303,8 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) smp_wmb(); OVL_I(inode)->__upperdentry = upperdentry; if (inode_unhashed(inode)) { + if (!inode->i_ino) + inode->i_ino = upperinode->i_ino; inode->i_private = upperinode; __insert_inode_hash(inode, (unsigned long) upperinode); } From 452061fd4521b2bf3225fc391dbe536e5f9c05e2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 9 Mar 2018 15:44:41 -0500 Subject: [PATCH 02/19] ovl: Set d->last properly during lookup d->last signifies that this is the last layer we are looking into and there is no more. And that means this allows for some optimzation opportunities during lookup. For example, in ovl_lookup_single() we don't have to check for opaque xattr of a directory is this is the last layer we are looking into (d->last = true). But knowing for sure whether we are looking into last layer can be very tricky. If redirects are not enabled, then we can look at poe->numlower and figure out if the lookup we are about to is last layer or not. But if redircts are enabled then it is possible poe->numlower suggests that we are looking in last layer, but there is an absolute redirect present in found element and that redirects us to a layer in root and that means lookup will continue in lower layers further. For example, consider following. /upperdir/pure (opaque=y) /upperdir/pure/foo (opaque=y,redirect=/bar) /lowerdir/bar In this case pure is "pure upper". When we look for "foo", that time poe->numlower=0. But that alone does not mean that we will not search for a merge candidate in /lowerdir. Absolute redirect changes that. IOW, d->last should not be set just based on poe->numlower if redirects are enabled. That can lead to setting d->last while it should not have and that means we will not check for opaque xattr while we should have. So do this. - If redirects are not enabled, then continue to rely on poe->numlower information to determine if it is last layer or not. - If redirects are enabled, then set d->last = true only if this is the last layer in root ovl_entry (roe). Suggested-by: Amir Goldstein Reviewed-by: Amir Goldstein Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi Fixes: 02b69b284cd7 ("ovl: lookup redirects") Cc: #v4.10 --- fs/overlayfs/namei.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 70fcfcc684cc..03d8c5132477 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -815,7 +815,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .is_dir = false, .opaque = false, .stop = false, - .last = !poe->numlower, + .last = ofs->config.redirect_follow ? false : !poe->numlower, .redirect = NULL, }; @@ -873,7 +873,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, for (i = 0; !d.stop && i < poe->numlower; i++) { struct ovl_path lower = poe->lowerstack[i]; - d.last = i == poe->numlower - 1; + if (!ofs->config.redirect_follow) + d.last = i == poe->numlower - 1; + else + d.last = lower.layer->idx == roe->numlower; + err = ovl_lookup_layer(lower.dentry, &d, &this); if (err) goto out_put; From 3ec9b3fafcaf441cc4d46b9742cd6ec0c79f8df0 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 12 Mar 2018 10:30:41 -0400 Subject: [PATCH 03/19] ovl: fix lookup with middle layer opaque dir and absolute path redirects As of now if we encounter an opaque dir while looking for a dentry, we set d->last=true. This means that there is no need to look further in any of the lower layers. This works fine as long as there are no redirets or relative redircts. But what if there is an absolute redirect on the children dentry of opaque directory. We still need to continue to look into next lower layer. This patch fixes it. Here is an example to demonstrate the issue. Say you have following setup. upper: /redirect (redirect=/a/b/c) lower1: /a/[b]/c ([b] is opaque) (c has absolute redirect=/a/b/d/) lower0: /a/b/d/foo Now "redirect" dir should merge with lower1:/a/b/c/ and lower0:/a/b/d. Note, despite the fact lower1:/a/[b] is opaque, we need to continue to look into lower0 because children c has an absolute redirect. Following is a reproducer. Watch me make foo disappear: $ mkdir lower middle upper work work2 merged $ mkdir lower/origin $ touch lower/origin/foo $ mount -t overlay none merged/ \ -olowerdir=lower,upperdir=middle,workdir=work2 $ mkdir merged/pure $ mv merged/origin merged/pure/redirect $ umount merged $ mount -t overlay none merged/ \ -olowerdir=middle:lower,upperdir=upper,workdir=work $ mv merged/pure/redirect merged/redirect Now you see foo inside a twice redirected merged dir: $ ls merged/redirect foo $ umount merged $ mount -t overlay none merged/ \ -olowerdir=middle:lower,upperdir=upper,workdir=work After mount cycle you don't see foo inside the same dir: $ ls merged/redirect During middle layer lookup, the opaqueness of middle/pure is left in the lookup state and then middle/pure/redirect is wrongly treated as opaque. Fixes: 02b69b284cd7 ("ovl: lookup redirects") Cc: #v4.10 Signed-off-by: Amir Goldstein Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 03d8c5132477..35418317ecf2 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -56,6 +56,15 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, if (s == next) goto invalid; } + /* + * One of the ancestor path elements in an absolute path + * lookup in ovl_lookup_layer() could have been opaque and + * that will stop further lookup in lower layers (d->stop=true) + * But we have found an absolute redirect in decendant path + * element and that should force continue lookup in lower + * layers (reset d->stop). + */ + d->stop = false; } else { if (strchr(buf, '/') != NULL) goto invalid; From 9f99e50d460ac7fd5f6c9b97aad0088c28c8656d Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 11 Apr 2018 20:09:29 +0300 Subject: [PATCH 04/19] ovl: set lower layer st_dev only if setting lower st_ino For broken hardlinks, we do not return lower st_ino, so we should also not return lower pseudo st_dev. Fixes: a0c5ad307ac0 ("ovl: relax same fs constraint for constant st_ino") Cc: #v4.15 Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 4689716f23d8..1d75b2e96c96 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -118,13 +118,10 @@ int ovl_getattr(const struct path *path, struct kstat *stat, */ if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || (!ovl_verify_lower(dentry->d_sb) && - (is_dir || lowerstat.nlink == 1))) + (is_dir || lowerstat.nlink == 1))) { stat->ino = lowerstat.ino; - - if (samefs) - WARN_ON_ONCE(stat->dev != lowerstat.dev); - else stat->dev = ovl_get_pseudo_dev(dentry); + } } if (samefs) { /* From 5b2cccd32c668de6bd1979545184cd7f0260f053 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2018 10:42:03 +0200 Subject: [PATCH 05/19] ovl: disambiguate ovl_encode_fh() Rename ovl_encode_fh() to ovl_encode_real_fh() to differentiate from the exportfs function ovl_encode_inode_fh() and change the latter to ovl_encode_fh() to match the exportfs method name. Rename ovl_decode_fh() to ovl_decode_real_fh() for consistency. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 6 +++--- fs/overlayfs/export.c | 12 ++++++------ fs/overlayfs/namei.c | 10 +++++----- fs/overlayfs/overlayfs.h | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index d855f508fa20..8bede0742619 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -232,7 +232,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) return err; } -struct ovl_fh *ovl_encode_fh(struct dentry *real, bool is_upper) +struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) { struct ovl_fh *fh; int fh_type, fh_len, dwords; @@ -300,7 +300,7 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower, * up and a pure upper inode. */ if (ovl_can_decode_fh(lower->d_sb)) { - fh = ovl_encode_fh(lower, false); + fh = ovl_encode_real_fh(lower, false); if (IS_ERR(fh)) return PTR_ERR(fh); } @@ -321,7 +321,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) const struct ovl_fh *fh; int err; - fh = ovl_encode_fh(upper, true); + fh = ovl_encode_real_fh(upper, true); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 87bd4148f4fb..e688cf019460 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -228,8 +228,8 @@ static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) goto fail; /* Encode an upper or lower file handle */ - fh = ovl_encode_fh(enc_lower ? ovl_dentry_lower(dentry) : - ovl_dentry_upper(dentry), !enc_lower); + fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) : + ovl_dentry_upper(dentry), !enc_lower); err = PTR_ERR(fh); if (IS_ERR(fh)) goto fail; @@ -267,8 +267,8 @@ static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len) return OVL_FILEID; } -static int ovl_encode_inode_fh(struct inode *inode, u32 *fid, int *max_len, - struct inode *parent) +static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, + struct inode *parent) { struct dentry *dentry; int type; @@ -685,7 +685,7 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, if (!ofs->upper_mnt) return ERR_PTR(-EACCES); - upper = ovl_decode_fh(fh, ofs->upper_mnt); + upper = ovl_decode_real_fh(fh, ofs->upper_mnt); if (IS_ERR_OR_NULL(upper)) return upper; @@ -829,7 +829,7 @@ static struct dentry *ovl_get_parent(struct dentry *dentry) } const struct export_operations ovl_export_operations = { - .encode_fh = ovl_encode_inode_fh, + .encode_fh = ovl_encode_fh, .fh_to_dentry = ovl_fh_to_dentry, .fh_to_parent = ovl_fh_to_parent, .get_name = ovl_get_name, diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 35418317ecf2..24bd387321d1 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -180,7 +180,7 @@ invalid: goto out; } -struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt) +struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt) { struct dentry *real; int bytes; @@ -326,7 +326,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, int i; for (i = 0; i < ofs->numlower; i++) { - origin = ovl_decode_fh(fh, ofs->lower_layers[i].mnt); + origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt); if (origin) break; } @@ -424,7 +424,7 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name, struct ovl_fh *fh; int err; - fh = ovl_encode_fh(real, is_upper); + fh = ovl_encode_real_fh(real, is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) goto fail; @@ -460,7 +460,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); - upper = ovl_decode_fh(fh, ofs->upper_mnt); + upper = ovl_decode_real_fh(fh, ofs->upper_mnt); kfree(fh); if (IS_ERR_OR_NULL(upper)) @@ -628,7 +628,7 @@ int ovl_get_index_name(struct dentry *origin, struct qstr *name) struct ovl_fh *fh; int err; - fh = ovl_encode_fh(origin, false); + fh = ovl_encode_real_fh(origin, false); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 225ff1171147..dd6c10e5a7db 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -266,7 +266,7 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) /* namei.c */ int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); -struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt); +struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt); int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, struct dentry *upperdentry, struct ovl_path **stackp); int ovl_verify_set_fh(struct dentry *dentry, const char *name, @@ -361,7 +361,7 @@ int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_flags(struct dentry *dentry, int flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); -struct ovl_fh *ovl_encode_fh(struct dentry *real, bool is_upper); +struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper); int ovl_set_origin(struct dentry *dentry, struct dentry *lower, struct dentry *upper); From 8a22efa15b46d524577cac79da63cebca8e8307f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 9 Mar 2018 15:51:02 +0200 Subject: [PATCH 06/19] ovl: do not try to reconnect a disconnected origin dentry On lookup of non directory, we try to decode the origin file handle stored in upper inode. The origin file handle is supposed to be decoded to a disconnected non-dir dentry, which is fine, because we only need the lower inode of a copy up origin. However, if the origin file handle somehow turns out to be a directory we pay the expensive cost of reconnecting the directory dentry, only to get a mismatch file type and drop the dentry. Optimize this case by explicitly opting out of reconnecting the dentry. Opting-out of reconnect is done by passing a NULL acceptable callback to exportfs_decode_fh(). While the case described above is a strange corner case that does not really need to be optimized, the API added for this optimization will be used by a following patch to optimize a more common case of decoding an overlayfs file handle. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/exportfs/expfs.c | 9 +++++++++ fs/overlayfs/export.c | 4 ++-- fs/overlayfs/namei.c | 16 +++++++++------- fs/overlayfs/overlayfs.h | 5 +++-- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 329a5d103846..645158dc33f1 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -435,6 +435,15 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (IS_ERR_OR_NULL(result)) return ERR_PTR(-ESTALE); + /* + * If no acceptance criteria was specified by caller, a disconnected + * dentry is also accepatable. Callers may use this mode to query if + * file handle is stale or to get a reference to an inode without + * risking the high overhead caused by directory reconnect. + */ + if (!acceptable) + return result; + if (d_is_dir(result)) { /* * This request is for a directory. diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index e688cf019460..15411350a7a1 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -685,7 +685,7 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, if (!ofs->upper_mnt) return ERR_PTR(-EACCES); - upper = ovl_decode_real_fh(fh, ofs->upper_mnt); + upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true); if (IS_ERR_OR_NULL(upper)) return upper; @@ -735,7 +735,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, } /* Then lookup origin by fh */ - err = ovl_check_origin_fh(ofs, fh, NULL, &stack); + err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); if (err) { goto out_err; } else if (index) { diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 24bd387321d1..810a333d2221 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -180,7 +180,8 @@ invalid: goto out; } -struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt) +struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, + bool connected) { struct dentry *real; int bytes; @@ -195,7 +196,7 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt) bytes = (fh->len - offsetof(struct ovl_fh, fid)); real = exportfs_decode_fh(mnt, (struct fid *)fh->fid, bytes >> 2, (int)fh->type, - ovl_acceptable, mnt); + connected ? ovl_acceptable : NULL, mnt); if (IS_ERR(real)) { /* * Treat stale file handle to lower file as "origin unknown". @@ -319,14 +320,15 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, } -int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp) { struct dentry *origin = NULL; int i; for (i = 0; i < ofs->numlower; i++) { - origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt); + origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt, + connected); if (origin) break; } @@ -370,7 +372,7 @@ static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, if (IS_ERR_OR_NULL(fh)) return PTR_ERR(fh); - err = ovl_check_origin_fh(ofs, fh, upperdentry, stackp); + err = ovl_check_origin_fh(ofs, fh, false, upperdentry, stackp); kfree(fh); if (err) { @@ -460,7 +462,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); - upper = ovl_decode_real_fh(fh, ofs->upper_mnt); + upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true); kfree(fh); if (IS_ERR_OR_NULL(upper)) @@ -567,7 +569,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) /* Check if non-dir index is orphan and don't warn before cleaning it */ if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) { - err = ovl_check_origin_fh(ofs, fh, index, &stack); + err = ovl_check_origin_fh(ofs, fh, false, index, &stack); if (err) goto fail; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index dd6c10e5a7db..b51613b355c5 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -266,8 +266,9 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) /* namei.c */ int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); -struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt); -int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, +struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, + bool connected); +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp); int ovl_verify_set_fh(struct dentry *dentry, const char *name, struct dentry *real, bool is_upper, bool set); From 8b58924ad55c3a9fbeddd1a02d09fd29435e50eb Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 9 Mar 2018 17:05:55 +0200 Subject: [PATCH 07/19] ovl: lookup in inode cache first when decoding lower file handle When decoding a lower file handle, we need to check if lower file was copied up and indexed and if it has a whiteout index, we need to check if this is an unlinked but open non-dir before returning -ESTALE. To find out if this is an unlinked but open non-dir we need to lookup an overlay inode in inode cache by lower inode and that requires decoding the lower file handle before looking in inode cache. Before this change, if the lower inode turned out to be a directory, we may have paid an expensive cost to reconnect that lower directory for nothing. After this change, we start by decoding a disconnected lower dentry and using the lower inode for looking up an overlay inode in inode cache. If we find overlay inode and dentry in cache, we avoid the index lookup overhead. If we don't find an overlay inode and dentry in cache, then we only need to decode a connected lower dentry in case the lower dentry is a non-indexed directory. The xfstests group overlay/exportfs tests decoding overlayfs file handles after drop_caches with different states of the file at encode and decode time. Overall the tests in the group call ovl_lower_fh_to_d() 89 times to decode a lower file handle. Before this change, the tests called ovl_get_index_fh() 75 times and reconnect_one() 61 times. After this change, the tests call ovl_get_index_fh() 70 times and reconnect_one() 59 times. The 2 cases where reconnect_one() was avoided are cases where a non-upper directory file handle was encoded, then the directory removed and then file handle was decoded. To demonstrate the affect on decoding file handles with hot inode/dentry cache, the drop_caches call in the tests was disabled. Without drop_caches, there are no reconnect_one() calls at all before or after the change. Before the change, there are 75 calls to ovl_get_index_fh(), exactly as the case with drop_caches. After the change, there are only 10 calls to ovl_get_index_fh(). Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/export.c | 58 ++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 15411350a7a1..ed73e363c7b1 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -703,25 +703,39 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, struct ovl_path *stack = &origin; struct dentry *dentry = NULL; struct dentry *index = NULL; - struct inode *inode = NULL; - bool is_deleted = false; + struct inode *inode; int err; - /* First lookup indexed upper by fh */ + /* First lookup overlay inode in inode cache by origin fh */ + err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack); + if (err) + return ERR_PTR(err); + + if (!d_is_dir(origin.dentry) || + !(origin.dentry->d_flags & DCACHE_DISCONNECTED)) { + inode = ovl_lookup_inode(sb, origin.dentry, false); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_err; + if (inode) { + dentry = d_find_any_alias(inode); + iput(inode); + if (dentry) + goto out; + } + } + + /* Then lookup indexed upper/whiteout by origin fh */ if (ofs->indexdir) { index = ovl_get_index_fh(ofs, fh); err = PTR_ERR(index); if (IS_ERR(index)) { - if (err != -ESTALE) - return ERR_PTR(err); - - /* Found a whiteout index - treat as deleted inode */ - is_deleted = true; index = NULL; + goto out_err; } } - /* Then try to get upper dir by index */ + /* Then try to get a connected upper dir by index */ if (index && d_is_dir(index)) { struct dentry *upper = ovl_index_upper(ofs, index); @@ -734,24 +748,19 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, goto out; } - /* Then lookup origin by fh */ - err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); - if (err) { - goto out_err; - } else if (index) { + /* Otherwise, get a connected non-upper dir or disconnected non-dir */ + if (d_is_dir(origin.dentry) && + (origin.dentry->d_flags & DCACHE_DISCONNECTED)) { + dput(origin.dentry); + origin.dentry = NULL; + err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); + if (err) + goto out_err; + } + if (index) { err = ovl_verify_origin(index, origin.dentry, false); if (err) goto out_err; - } else if (is_deleted) { - /* Lookup deleted non-dir by origin inode */ - if (!d_is_dir(origin.dentry)) - inode = ovl_lookup_inode(sb, origin.dentry, false); - err = -ESTALE; - if (!inode || atomic_read(&inode->i_count) == 1) - goto out_err; - - /* Deleted but still open? */ - index = dget(ovl_i_dentry_upper(inode)); } dentry = ovl_get_dentry(sb, NULL, &origin, index); @@ -759,7 +768,6 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, out: dput(origin.dentry); dput(index); - iput(inode); return dentry; out_err: From e9b77f90cc2307f6d2457696d3d76288a5df11fc Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 9 Mar 2018 15:44:42 -0500 Subject: [PATCH 08/19] ovl: Do not check for redirect if this is last layer If we are looking in last layer, then there should not be any need to process redirect. redirect information is used only for lookup in next lower layer and there is no more lower layer to look into. So no need to process redirects. IOW, ignore redirects on lowest layer. Signed-off-by: Vivek Goyal Reviewed-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 810a333d2221..5287c4bcf14b 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -258,7 +258,10 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, goto out; } d->is_dir = true; - if (!d->last && ovl_is_opaquedir(this)) { + if (d->last) + goto out; + + if (ovl_is_opaquedir(this)) { d->stop = d->opaque = true; goto out; } From 102b0d11cbe83e607a8e9060c05930905332f9ec Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 9 Mar 2018 15:44:43 -0500 Subject: [PATCH 09/19] ovl: set d->is_dir and d->opaque for last path element Certain properties in ovl_lookup_data should be set only for the last element of the path. IOW, if we are calling ovl_lookup_single() for an absolute redirect, then d->is_dir and d->opaque do not make much sense for intermediate path elements. Instead set them only if dentry being lookup is last path element. As of now we do not seem to be making use of d->opaque if it is set for a path/dentry in lower. But just define the semantics so that future code can make use of this assumption. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 5287c4bcf14b..0a26b76590d5 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -230,6 +230,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, { struct dentry *this; int err; + bool last_element = !post[0]; this = lookup_one_len_unlocked(name, base, namelen); if (IS_ERR(this)) { @@ -257,12 +258,15 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, goto put_and_out; goto out; } - d->is_dir = true; + if (last_element) + d->is_dir = true; if (d->last) goto out; if (ovl_is_opaquedir(this)) { - d->stop = d->opaque = true; + d->stop = true; + if (last_element) + d->opaque = true; goto out; } err = ovl_check_redirect(this, d, prelen, post); From 0471a9cdb00ff4a84e781c6dc4b0f87297bac1ed Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 20 Mar 2018 16:35:40 -0400 Subject: [PATCH 10/19] ovl: cleanup setting OVL_INDEX Signed-off-by: Miklos Szeredi --- fs/overlayfs/export.c | 3 --- fs/overlayfs/inode.c | 3 +++ fs/overlayfs/namei.c | 2 -- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index ed73e363c7b1..bb290cd1e727 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -311,9 +311,6 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, return ERR_CAST(inode); } - if (index) - ovl_set_flag(OVL_INDEX, inode); - dentry = d_find_any_alias(inode); if (!dentry) { dentry = d_alloc_anon(inode->i_sb); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 1d75b2e96c96..b3be0cfd4c92 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -770,6 +770,9 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, if (upperdentry && ovl_is_impuredir(upperdentry)) ovl_set_flag(OVL_IMPURE, inode); + if (index) + ovl_set_flag(OVL_INDEX, inode); + /* Check for non-merge dir that may have whiteouts */ if (is_dir) { if (((upperdentry && lowerdentry) || numlower > 1) || diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 0a26b76590d5..72230b759b58 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1007,8 +1007,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_free_oe; OVL_I(inode)->redirect = upperredirect; - if (index) - ovl_set_flag(OVL_INDEX, inode); } revert_creds(old_cred); From 3a291774d17e096950854506d8869dfe950ec932 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 12 Apr 2018 12:04:49 +0200 Subject: [PATCH 11/19] ovl: add WARN_ON() for non-dir redirect cases Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 72230b759b58..52f0b13be633 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -256,6 +256,12 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; if (d->is_dir) goto put_and_out; + + /* + * NB: handle failure to lookup non-last element when non-dir + * redirects become possible + */ + WARN_ON(!last_element); goto out; } if (last_element) @@ -1006,6 +1012,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_free_oe; + /* + * NB: handle redirected hard links when non-dir redirects + * become possible + */ + WARN_ON(OVL_I(inode)->redirect); OVL_I(inode)->redirect = upperredirect; } From 8f35cf51cd24a08e3d5b97e7253c93a5c90a4c1e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 12 Apr 2018 12:04:50 +0200 Subject: [PATCH 12/19] ovl: cleanup ovl_update_time() No need to mess with an alias, the upperdentry can be retrieved directly from the overlay inode. Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b3be0cfd4c92..29069f29b3a6 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -380,24 +380,18 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) int ovl_update_time(struct inode *inode, struct timespec *ts, int flags) { - struct dentry *alias; - struct path upperpath; + if (flags & S_ATIME) { + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + struct path upperpath = { + .mnt = ofs->upper_mnt, + .dentry = ovl_upperdentry_dereference(OVL_I(inode)), + }; - if (!(flags & S_ATIME)) - return 0; - - alias = d_find_any_alias(inode); - if (!alias) - return 0; - - ovl_path_upper(alias, &upperpath); - if (upperpath.dentry) { - touch_atime(&upperpath); - inode->i_atime = d_inode(upperpath.dentry)->i_atime; + if (upperpath.dentry) { + touch_atime(&upperpath); + inode->i_atime = d_inode(upperpath.dentry)->i_atime; + } } - - dput(alias); - return 0; } From da309e8c055de8d6461ae01764a3352c77878735 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 8 Nov 2017 19:39:51 +0200 Subject: [PATCH 13/19] ovl: factor out ovl_map_dev_ino() helper A helper for ovl_getattr() to map the values of st_dev and st_ino according to constant st_ino rules. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 88 ++++++++++++++++++++++------------------ fs/overlayfs/overlayfs.h | 1 + fs/overlayfs/util.c | 7 ++++ 3 files changed, 57 insertions(+), 39 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 29069f29b3a6..8a283ec21e34 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -16,13 +16,6 @@ #include "overlayfs.h" -static dev_t ovl_get_pseudo_dev(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - return oe->lowerstack[0].layer->pseudo_dev; -} - int ovl_setattr(struct dentry *dentry, struct iattr *attr) { int err; @@ -66,6 +59,43 @@ out: return err; } +static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, + struct ovl_layer *lower_layer) +{ + bool samefs = ovl_same_sb(dentry->d_sb); + + if (samefs) { + /* + * When all layers are on the same fs, all real inode + * number are unique, so we use the overlay st_dev, + * which is friendly to du -x. + */ + stat->dev = dentry->d_sb->s_dev; + } else if (S_ISDIR(dentry->d_inode->i_mode)) { + /* + * Always use the overlay st_dev for directories, so 'find + * -xdev' will scan the entire overlay mount and won't cross the + * overlay mount boundaries. + * + * If not all layers are on the same fs the pair {real st_ino; + * overlay st_dev} is not unique, so use the non persistent + * overlay st_ino for directories. + */ + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + } else if (lower_layer) { + /* + * For non-samefs setup, if we cannot map all layers st_ino + * to a unified address space, we need to make sure that st_dev + * is unique per layer. Upper layer uses real st_dev and lower + * layers use the unique anonymous bdev. + */ + stat->dev = lower_layer->pseudo_dev; + } + + return 0; +} + int ovl_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -75,6 +105,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, const struct cred *old_cred; bool is_dir = S_ISDIR(dentry->d_inode->i_mode); bool samefs = ovl_same_sb(dentry->d_sb); + struct ovl_layer *lower_layer = NULL; int err; type = ovl_path_real(dentry, &realpath); @@ -84,14 +115,16 @@ int ovl_getattr(const struct path *path, struct kstat *stat, goto out; /* - * For non-dir or same fs, we use st_ino of the copy up origin, if we - * know it. This guaranties constant st_dev/st_ino across copy up. + * For non-dir or same fs, we use st_ino of the copy up origin. + * This guaranties constant st_dev/st_ino across copy up. * - * If filesystem supports NFS export ops, this also guaranties + * If lower filesystem supports NFS file handles, this also guaranties * persistent st_ino across mount cycle. */ if (!is_dir || samefs) { - if (OVL_TYPE_ORIGIN(type)) { + if (!OVL_TYPE_UPPER(type)) { + lower_layer = ovl_layer_lower(dentry); + } else if (OVL_TYPE_ORIGIN(type)) { struct kstat lowerstat; u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0); @@ -120,38 +153,15 @@ int ovl_getattr(const struct path *path, struct kstat *stat, (!ovl_verify_lower(dentry->d_sb) && (is_dir || lowerstat.nlink == 1))) { stat->ino = lowerstat.ino; - stat->dev = ovl_get_pseudo_dev(dentry); + lower_layer = ovl_layer_lower(dentry); } } - if (samefs) { - /* - * When all layers are on the same fs, all real inode - * number are unique, so we use the overlay st_dev, - * which is friendly to du -x. - */ - stat->dev = dentry->d_sb->s_dev; - } else if (!OVL_TYPE_UPPER(type)) { - /* - * For non-samefs setup, to make sure that st_dev/st_ino - * pair is unique across the system, we use a unique - * anonymous st_dev for lower layer inode. - */ - stat->dev = ovl_get_pseudo_dev(dentry); - } - } else { - /* - * Always use the overlay st_dev for directories, so 'find - * -xdev' will scan the entire overlay mount and won't cross the - * overlay mount boundaries. - * - * If not all layers are on the same fs the pair {real st_ino; - * overlay st_dev} is not unique, so use the non persistent - * overlay st_ino for directories. - */ - stat->dev = dentry->d_sb->s_dev; - stat->ino = dentry->d_inode->i_ino; } + err = ovl_map_dev_ino(dentry, stat, lower_layer); + if (err) + goto out; + /* * It's probably not worth it to count subdirs to get the * correct link count. nlink=1 seems to pacify 'find' and diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index b51613b355c5..09779f9111a8 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -215,6 +215,7 @@ void ovl_path_lower(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 493f9b76fbf6..5042293d2078 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -172,6 +172,13 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry) return oe->numlower ? oe->lowerstack[0].dentry : NULL; } +struct ovl_layer *ovl_layer_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->numlower ? oe->lowerstack[0].layer : NULL; +} + struct dentry *ovl_dentry_real(struct dentry *dentry) { return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry); From 5148626b806a74dd219f2bce5f204abf909f6930 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 28 Mar 2018 20:22:41 +0300 Subject: [PATCH 14/19] ovl: allocate anon bdev per unique lower fs Instead of allocating an anonymous bdev per lower layer, allocate one anonymous bdev per every unique lower fs that is different than upper fs. Every unique lower fs is assigned an fsid > 0 and the number of unique lower fs are stored in ofs->numlowerfs. The assigned fsid is stored in the lower layer struct and will be used also for inode number multiplexing. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 9 +++--- fs/overlayfs/ovl_entry.h | 18 ++++++++--- fs/overlayfs/super.c | 66 +++++++++++++++++++++++++++++----------- fs/overlayfs/util.c | 7 ++++- 4 files changed, 72 insertions(+), 28 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 8a283ec21e34..cfccd91c51e3 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -83,14 +83,15 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, */ stat->dev = dentry->d_sb->s_dev; stat->ino = dentry->d_inode->i_ino; - } else if (lower_layer) { + } else if (lower_layer && lower_layer->fsid) { /* * For non-samefs setup, if we cannot map all layers st_ino * to a unified address space, we need to make sure that st_dev - * is unique per layer. Upper layer uses real st_dev and lower - * layers use the unique anonymous bdev. + * is unique per lower fs. Upper layer uses real st_dev and + * lower layers use the unique anonymous bdev assigned to the + * lower fs. */ - stat->dev = lower_layer->pseudo_dev; + stat->dev = lower_layer->fs->pseudo_dev; } return 0; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index bfef6edcc111..e1c838c27a74 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -20,11 +20,18 @@ struct ovl_config { bool nfs_export; }; +struct ovl_sb { + struct super_block *sb; + dev_t pseudo_dev; +}; + struct ovl_layer { struct vfsmount *mnt; - dev_t pseudo_dev; - /* Index of this layer in fs root (upper == 0) */ + struct ovl_sb *fs; + /* Index of this layer in fs root (upper idx == 0) */ int idx; + /* One fsid per unique underlying sb (upper fsid == 0) */ + int fsid; }; struct ovl_path { @@ -35,8 +42,11 @@ struct ovl_path { /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; - unsigned numlower; + unsigned int numlower; + /* Number of unique lower sb that differ from upper sb */ + unsigned int numlowerfs; struct ovl_layer *lower_layers; + struct ovl_sb *lower_fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; /* workdir is the 'work' directory under workbasedir */ @@ -50,8 +60,6 @@ struct ovl_fs { const struct cred *creator_cred; bool tmpfile; bool noxattr; - /* sb common to all layers */ - struct super_block *same_sb; /* Did we take the inuse lock? */ bool upperdir_locked; bool workdir_locked; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7c24619ae7fc..7d97d30cad39 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -236,11 +236,12 @@ static void ovl_free_fs(struct ovl_fs *ofs) if (ofs->upperdir_locked) ovl_inuse_unlock(ofs->upper_mnt->mnt_root); mntput(ofs->upper_mnt); - for (i = 0; i < ofs->numlower; i++) { + for (i = 0; i < ofs->numlower; i++) mntput(ofs->lower_layers[i].mnt); - free_anon_bdev(ofs->lower_layers[i].pseudo_dev); - } + for (i = 0; i < ofs->numlowerfs; i++) + free_anon_bdev(ofs->lower_fs[i].pseudo_dev); kfree(ofs->lower_layers); + kfree(ofs->lower_fs); kfree(ofs->config.lowerdir); kfree(ofs->config.upperdir); @@ -1108,6 +1109,35 @@ out: return err; } +/* Get a unique fsid for the layer */ +static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb) +{ + unsigned int i; + dev_t dev; + int err; + + /* fsid 0 is reserved for upper fs even with non upper overlay */ + if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb) + return 0; + + for (i = 0; i < ofs->numlowerfs; i++) { + if (ofs->lower_fs[i].sb == sb) + return i + 1; + } + + err = get_anon_bdev(&dev); + if (err) { + pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + return err; + } + + ofs->lower_fs[ofs->numlowerfs].sb = sb; + ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev; + ofs->numlowerfs++; + + return ofs->numlowerfs; +} + static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, unsigned int numlower) { @@ -1119,23 +1149,27 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, GFP_KERNEL); if (ofs->lower_layers == NULL) goto out; + + ofs->lower_fs = kcalloc(numlower, sizeof(struct ovl_sb), + GFP_KERNEL); + if (ofs->lower_fs == NULL) + goto out; + for (i = 0; i < numlower; i++) { struct vfsmount *mnt; - dev_t dev; + int fsid; - err = get_anon_bdev(&dev); - if (err) { - pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + err = fsid = ovl_get_fsid(ofs, stack[i].mnt->mnt_sb); + if (err < 0) goto out; - } mnt = clone_private_mount(&stack[i]); err = PTR_ERR(mnt); if (IS_ERR(mnt)) { pr_err("overlayfs: failed to clone lowerpath\n"); - free_anon_bdev(dev); goto out; } + /* * Make lower layers R/O. That way fchmod/fchown on lower file * will fail instead of modifying lower fs. @@ -1143,15 +1177,13 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; ofs->lower_layers[ofs->numlower].mnt = mnt; - ofs->lower_layers[ofs->numlower].pseudo_dev = dev; ofs->lower_layers[ofs->numlower].idx = i + 1; + ofs->lower_layers[ofs->numlower].fsid = fsid; + if (fsid) { + ofs->lower_layers[ofs->numlower].fs = + &ofs->lower_fs[fsid - 1]; + } ofs->numlower++; - - /* Check if all lower layers are on same sb */ - if (i == 0) - ofs->same_sb = mnt->mnt_sb; - else if (ofs->same_sb != mnt->mnt_sb) - ofs->same_sb = NULL; } err = 0; out: @@ -1305,8 +1337,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* If the upper fs is nonexistent, we mark overlayfs r/o too */ if (!ofs->upper_mnt) sb->s_flags |= SB_RDONLY; - else if (ofs->upper_mnt->mnt_sb != ofs->same_sb) - ofs->same_sb = NULL; if (!(ovl_force_readonly(ofs)) && ofs->config.index) { err = ovl_get_indexdir(ofs, oe, &upperpath); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 5042293d2078..9dfec74ec77d 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -47,7 +47,12 @@ struct super_block *ovl_same_sb(struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; - return ofs->same_sb; + if (!ofs->numlowerfs) + return ofs->upper_mnt->mnt_sb; + else if (ofs->numlowerfs == 1 && !ofs->upper_mnt) + return ofs->lower_fs[0].sb; + else + return NULL; } bool ovl_can_decode_fh(struct super_block *sb) From e487d889b7e3e8ec4091eb83bc4f7e67c7f05e27 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 7 Nov 2017 13:55:04 +0200 Subject: [PATCH 15/19] ovl: constant st_ino for non-samefs with xino On 64bit systems, when overlay layers are not all on the same fs, but all inode numbers of underlying fs are not using the high bits, use the high bits to partition the overlay st_ino address space. The high bits hold the fsid (upper fsid is 0). This way overlay inode numbers are unique and all inodes use overlay st_dev. Inode numbers are also persistent for a given layer configuration. Currently, our only indication for available high ino bits is from a filesystem that supports file handles and uses the default encode_fh() operation, which encodes a 32bit inode number. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 31 +++++++++++++++++++++++++++++-- fs/overlayfs/overlayfs.h | 9 ++++++++- fs/overlayfs/ovl_entry.h | 2 ++ fs/overlayfs/super.c | 26 ++++++++++++++++++++++---- fs/overlayfs/util.c | 17 ++++++++++++++--- 5 files changed, 75 insertions(+), 10 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index cfccd91c51e3..51d780898d89 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -63,6 +63,7 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, struct ovl_layer *lower_layer) { bool samefs = ovl_same_sb(dentry->d_sb); + unsigned int xinobits = ovl_xino_bits(dentry->d_sb); if (samefs) { /* @@ -71,7 +72,31 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, * which is friendly to du -x. */ stat->dev = dentry->d_sb->s_dev; - } else if (S_ISDIR(dentry->d_inode->i_mode)) { + return 0; + } else if (xinobits) { + unsigned int shift = 64 - xinobits; + /* + * All inode numbers of underlying fs should not be using the + * high xinobits, so we use high xinobits to partition the + * overlay st_ino address space. The high bits holds the fsid + * (upper fsid is 0). This way overlay inode numbers are unique + * and all inodes use overlay st_dev. Inode numbers are also + * persistent for a given layer configuration. + */ + if (stat->ino >> shift) { + pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n", + dentry, stat->ino, xinobits); + } else { + if (lower_layer) + stat->ino |= ((u64)lower_layer->fsid) << shift; + + stat->dev = dentry->d_sb->s_dev; + return 0; + } + } + + /* The inode could not be mapped to a unified st_ino address space */ + if (S_ISDIR(dentry->d_inode->i_mode)) { /* * Always use the overlay st_dev for directories, so 'find * -xdev' will scan the entire overlay mount and won't cross the @@ -118,11 +143,13 @@ int ovl_getattr(const struct path *path, struct kstat *stat, /* * For non-dir or same fs, we use st_ino of the copy up origin. * This guaranties constant st_dev/st_ino across copy up. + * With xino feature and non-samefs, we use st_ino of the copy up + * origin masked with high bits that represent the layer id. * * If lower filesystem supports NFS file handles, this also guaranties * persistent st_ino across mount cycle. */ - if (!is_dir || samefs) { + if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) { if (!OVL_TYPE_UPPER(type)) { lower_layer = ovl_layer_lower(dentry); } else if (OVL_TYPE_ORIGIN(type)) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 09779f9111a8..8eefc309aeb6 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -202,7 +202,7 @@ void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); struct super_block *ovl_same_sb(struct super_block *sb); -bool ovl_can_decode_fh(struct super_block *sb); +int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); bool ovl_verify_lower(struct super_block *sb); @@ -264,6 +264,13 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); } +static inline unsigned int ovl_xino_bits(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->xino_bits; +} + /* namei.c */ int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index e1c838c27a74..620bd20f9a22 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -63,6 +63,8 @@ struct ovl_fs { /* Did we take the inuse lock? */ bool upperdir_locked; bool workdir_locked; + /* Inode numbers in all layers do not use the high xino_bits */ + unsigned int xino_bits; }; /* private information held for every overlayfs dentry */ diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7d97d30cad39..d7284444f404 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi "); @@ -701,6 +702,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, static int ovl_lower_dir(const char *name, struct path *path, struct ovl_fs *ofs, int *stack_depth, bool *remote) { + int fh_type; int err; err = ovl_mount_dir_noesc(name, path); @@ -720,15 +722,19 @@ static int ovl_lower_dir(const char *name, struct path *path, * The inodes index feature and NFS export need to encode and decode * file handles, so they require that all layers support them. */ + fh_type = ovl_can_decode_fh(path->dentry->d_sb); if ((ofs->config.nfs_export || - (ofs->config.index && ofs->config.upperdir)) && - !ovl_can_decode_fh(path->dentry->d_sb)) { + (ofs->config.index && ofs->config.upperdir)) && !fh_type) { ofs->config.index = false; ofs->config.nfs_export = false; pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", name); } + /* Check if lower fs has 32bit inode numbers */ + if (fh_type != FILEID_INO32_GEN) + ofs->xino_bits = 0; + return 0; out_put: @@ -952,6 +958,7 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) { struct vfsmount *mnt = ofs->upper_mnt; struct dentry *temp; + int fh_type; int err; err = mnt_want_write(mnt); @@ -1001,12 +1008,16 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) } /* Check if upper/work fs supports file handles */ - if (ofs->config.index && - !ovl_can_decode_fh(ofs->workdir->d_sb)) { + fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); + if (ofs->config.index && !fh_type) { ofs->config.index = false; pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); } + /* Check if upper fs has 32bit inode numbers */ + if (fh_type != FILEID_INO32_GEN) + ofs->xino_bits = 0; + /* NFS export of r/w mount depends on index */ if (ofs->config.nfs_export && !ofs->config.index) { pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n"); @@ -1185,6 +1196,11 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, } ofs->numlower++; } + + /* When all layers on same fs, overlay can use real inode numbers */ + if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt)) + ofs->xino_bits = 0; + err = 0; out: return err; @@ -1308,6 +1324,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; sb->s_maxbytes = MAX_LFS_FILESIZE; + /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ + ofs->xino_bits = BITS_PER_LONG - 32; if (ofs->config.upperdir) { if (!ofs->config.workdir) { pr_err("overlayfs: missing 'workdir'\n"); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 9dfec74ec77d..6f1078028c66 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -55,10 +55,21 @@ struct super_block *ovl_same_sb(struct super_block *sb) return NULL; } -bool ovl_can_decode_fh(struct super_block *sb) +/* + * Check if underlying fs supports file handles and try to determine encoding + * type, in order to deduce maximum inode number used by fs. + * + * Return 0 if file handles are not supported. + * Return 1 (FILEID_INO32_GEN) if fs uses the default 32bit inode encoding. + * Return -1 if fs uses a non default encoding with unknown inode size. + */ +int ovl_can_decode_fh(struct super_block *sb) { - return (sb->s_export_op && sb->s_export_op->fh_to_dentry && - !uuid_is_null(&sb->s_uuid)); + if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry || + uuid_is_null(&sb->s_uuid)) + return 0; + + return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN; } struct dentry *ovl_indexdir(struct super_block *sb) From 12574a9f4c9cc9d8d6fd9078cbb8ec7d3e9ed46b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 16 Mar 2018 10:39:37 +0200 Subject: [PATCH 16/19] ovl: consistent i_ino for non-samefs with xino When overlay layers are not all on the same fs, but all inode numbers of underlying fs do not use the high 'xino' bits, overlay st_ino values are constant and persistent. In that case, set i_ino value to the same value as st_ino for nfsd readdirplus validator. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/export.c | 2 +- fs/overlayfs/inode.c | 27 ++++++++++++++++++--------- fs/overlayfs/namei.c | 4 +--- fs/overlayfs/overlayfs.h | 2 +- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index bb290cd1e727..425a94672300 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -305,7 +305,7 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, if (d_is_dir(upper ?: lower)) return ERR_PTR(-EIO); - inode = ovl_get_inode(sb, dget(upper), lower, index, !!lower); + inode = ovl_get_inode(sb, dget(upper), lowerpath, index, !!lower); if (IS_ERR(inode)) { dput(upper); return ERR_CAST(inode); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 51d780898d89..6e3815fb006b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -489,19 +489,26 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) } static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, - unsigned long ino) + unsigned long ino, int fsid) { + int xinobits = ovl_xino_bits(inode->i_sb); + /* * When NFS export is enabled and d_ino is consistent with st_ino - * (samefs), set the same value to i_ino, because nfsd readdirplus - * compares d_ino values to i_ino values of child entries. When called - * from ovl_new_inode(), ino arg is 0, so i_ino will be updated to real + * (samefs or i_ino has enough bits to encode layer), set the same + * value used for d_ino to i_ino, because nfsd readdirplus compares + * d_ino values to i_ino values of child entries. When called from + * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real * upper inode i_ino on ovl_inode_init() or ovl_inode_update(). */ - if (inode->i_sb->s_export_op && ovl_same_sb(inode->i_sb)) + if (inode->i_sb->s_export_op && + (ovl_same_sb(inode->i_sb) || xinobits)) { inode->i_ino = ino; - else + if (xinobits && fsid && !(ino >> (64 - xinobits))) + inode->i_ino |= (unsigned long)fsid << (64 - xinobits); + } else { inode->i_ino = get_next_ino(); + } inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; #ifdef CONFIG_FS_POSIX_ACL @@ -637,7 +644,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) inode = new_inode(sb); if (inode) - ovl_fill_inode(inode, mode, rdev, 0); + ovl_fill_inode(inode, mode, rdev, 0, 0); return inode; } @@ -743,12 +750,14 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, } struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, - struct dentry *lowerdentry, struct dentry *index, + struct ovl_path *lowerpath, struct dentry *index, unsigned int numlower) { struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; + struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index); + int fsid = bylower ? lowerpath->layer->fsid : 0; bool is_dir; unsigned long ino = 0; @@ -796,7 +805,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, if (!inode) goto out_nomem; } - ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino); + ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid); ovl_inode_init(inode, upperdentry, lowerdentry); if (upperdentry && ovl_is_impuredir(upperdentry)) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 52f0b13be633..2dba29eadde6 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1004,9 +1004,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, upperdentry = dget(index); if (upperdentry || ctr) { - if (ctr) - origin = stack[0].dentry; - inode = ovl_get_inode(dentry->d_sb, upperdentry, origin, index, + inode = ovl_get_inode(dentry->d_sb, upperdentry, stack, index, ctr); err = PTR_ERR(inode); if (IS_ERR(inode)) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 8eefc309aeb6..e0b7de799f6b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -338,7 +338,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, bool is_upper); struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, - struct dentry *lowerdentry, struct dentry *index, + struct ovl_path *lowerpath, struct dentry *index, unsigned int numlower); static inline void ovl_copyattr(struct inode *from, struct inode *to) { From adbf4f7ea834671c9d12002136c8162b34e011d5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 6 Nov 2017 16:48:02 +0200 Subject: [PATCH 17/19] ovl: consistent d_ino for non-samefs with xino When overlay layers are not all on the same fs, but all inode numbers of underlying fs do not use the high 'xino' bits, overlay st_ino values are constant and persistent. In that case, relax non-samefs constraint for consistent d_ino and always iterate non-merge dir using ovl_fill_real() actor so we can remap lower inode numbers to unique lower fs range. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/readdir.c | 45 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index c11f5c0906c3..ef1fe42ff7bb 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -120,6 +120,10 @@ static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, if (!rdd->dentry) return false; + /* Always recalc d_ino when remapping lower inode numbers */ + if (ovl_xino_bits(rdd->dentry->d_sb)) + return true; + /* Always recalc d_ino for parent */ if (strcmp(p->name, "..") == 0) return true; @@ -435,6 +439,19 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) return cache; } +/* Map inode number to lower fs unique range */ +static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, + const char *name, int namelen) +{ + if (ino >> (64 - xinobits)) { + pr_warn_ratelimited("overlayfs: d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", + namelen, name, ino, xinobits); + return ino; + } + + return ino | ((u64)fsid) << (64 - xinobits); +} + /* * Set d_ino for upper entries. Non-upper entries should always report * the uppermost real inode ino and should not call this function. @@ -452,9 +469,10 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) struct dentry *this = NULL; enum ovl_path_type type; u64 ino = p->real_ino; + int xinobits = ovl_xino_bits(dir->d_sb); int err = 0; - if (!ovl_same_sb(dir->d_sb)) + if (!ovl_same_sb(dir->d_sb) && !xinobits) goto out; if (p->name[0] == '.') { @@ -491,6 +509,10 @@ get: WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); ino = stat.ino; + } else if (xinobits && !OVL_TYPE_UPPER(type)) { + ino = ovl_remap_lower_ino(ino, xinobits, + ovl_layer_lower(this)->fsid, + p->name, p->len); } out: @@ -618,6 +640,8 @@ struct ovl_readdir_translate { struct ovl_dir_cache *cache; struct dir_context ctx; u64 parent_ino; + int fsid; + int xinobits; }; static int ovl_fill_real(struct dir_context *ctx, const char *name, @@ -628,14 +652,17 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name, container_of(ctx, struct ovl_readdir_translate, ctx); struct dir_context *orig_ctx = rdt->orig_ctx; - if (rdt->parent_ino && strcmp(name, "..") == 0) + if (rdt->parent_ino && strcmp(name, "..") == 0) { ino = rdt->parent_ino; - else if (rdt->cache) { + } else if (rdt->cache) { struct ovl_cache_entry *p; p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); if (p) ino = p->ino; + } else if (rdt->xinobits) { + ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid, + name, namelen); } return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); @@ -646,11 +673,16 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) int err; struct ovl_dir_file *od = file->private_data; struct dentry *dir = file->f_path.dentry; + struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, .orig_ctx = ctx, + .xinobits = ovl_xino_bits(dir->d_sb), }; + if (rdt.xinobits && lower_layer) + rdt.fsid = lower_layer->fsid; + if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { struct kstat stat; struct path statpath = file->f_path; @@ -693,9 +725,10 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) * dir is impure then need to adjust d_ino for copied up * entries. */ - if (ovl_same_sb(dentry->d_sb) && - (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) || - OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent)))) { + if (ovl_xino_bits(dentry->d_sb) || + (ovl_same_sb(dentry->d_sb) && + (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) || + OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { return ovl_iterate_real(file, ctx); } return iterate_dir(od->realfile, ctx); From 795939a93e600587e52c34fe02402b27ddda6017 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 29 Mar 2018 09:08:18 +0300 Subject: [PATCH 18/19] ovl: add support for "xino" mount and config options With mount option "xino=on", mounter declares that there are enough free high bits in underlying fs to hold the layer fsid. If overlayfs does encounter underlying inodes using the high xino bits reserved for layer fsid, a warning will be emitted and the original inode number will be used. The mount option name "xino" goes after a similar meaning mount option of aufs, but in overlayfs case, the mapping is stateless. An example for a use case of "xino=on" is when upper/lower is on an xfs filesystem. xfs uses 64bit inode numbers, but it currently never uses the upper 8bit for inode numbers exposed via stat(2) and that is not likely to change in the future without user opting-in for a new xfs feature. The actual number of unused upper bit is much larger and determined by the xfs filesystem geometry (64 - agno_log - agblklog - inopblog). That means that for all practical purpose, there are enough unused bits in xfs inode numbers for more than OVL_MAX_STACK unique fsid's. Another use case of "xino=on" is when upper/lower is on tmpfs. tmpfs inode numbers are allocated sequentially since boot, so they will practially never use the high inode number bits. For compatibility with applications that expect 32bit inodes, the feature can be disabled with "xino=off". The option "xino=auto" automatically detects underlying filesystem that use 32bit inodes and enables the feature. The Kconfig option OVERLAY_FS_XINO_AUTO and module parameter of the same name, determine if the default mode for overlayfs mount is "xino=auto" or "xino=off". Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/Kconfig | 17 ++++++++++ fs/overlayfs/ovl_entry.h | 1 + fs/overlayfs/super.c | 73 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 88 insertions(+), 3 deletions(-) diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index ce6ff5a0a6e4..17032631c5cf 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -86,3 +86,20 @@ config OVERLAY_FS_NFS_EXPORT case basis with the "nfs_export=on" mount option. Say N unless you fully understand the consequences. + +config OVERLAY_FS_XINO_AUTO + bool "Overlayfs: auto enable inode number mapping" + default n + depends on OVERLAY_FS + help + If this config option is enabled then overlay filesystems will use + unused high bits in undelying filesystem inode numbers to map all + inodes to a unified address space. The mapped 64bit inode numbers + might not be compatible with applications that expect 32bit inodes. + + If compatibility with applications that expect 32bit inodes is not an + issue, then it is safe and recommended to say Y here. + + For more information, see Documentation/filesystems/overlayfs.txt + + If unsure, say N. diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 620bd20f9a22..41655a7d6894 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -18,6 +18,7 @@ struct ovl_config { const char *redirect_mode; bool index; bool nfs_export; + int xino; }; struct ovl_sb { diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d7284444f404..e8551c97de51 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -51,6 +51,11 @@ module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644); MODULE_PARM_DESC(ovl_nfs_export_def, "Default to on or off for the NFS export feature"); +static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO); +module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644); +MODULE_PARM_DESC(ovl_xino_auto_def, + "Auto enable xino feature"); + static void ovl_entry_stack_free(struct ovl_entry *oe) { unsigned int i; @@ -327,6 +332,23 @@ static const char *ovl_redirect_mode_def(void) return ovl_redirect_dir_def ? "on" : "off"; } +enum { + OVL_XINO_OFF, + OVL_XINO_AUTO, + OVL_XINO_ON, +}; + +static const char * const ovl_xino_str[] = { + "off", + "auto", + "on", +}; + +static inline int ovl_xino_def(void) +{ + return ovl_xino_auto_def ? OVL_XINO_AUTO : OVL_XINO_OFF; +} + /** * ovl_show_options * @@ -352,6 +374,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) if (ofs->config.nfs_export != ovl_nfs_export_def) seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? "on" : "off"); + if (ofs->config.xino != ovl_xino_def()) + seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]); return 0; } @@ -386,6 +410,9 @@ enum { OPT_INDEX_OFF, OPT_NFS_EXPORT_ON, OPT_NFS_EXPORT_OFF, + OPT_XINO_ON, + OPT_XINO_OFF, + OPT_XINO_AUTO, OPT_ERR, }; @@ -399,6 +426,9 @@ static const match_table_t ovl_tokens = { {OPT_INDEX_OFF, "index=off"}, {OPT_NFS_EXPORT_ON, "nfs_export=on"}, {OPT_NFS_EXPORT_OFF, "nfs_export=off"}, + {OPT_XINO_ON, "xino=on"}, + {OPT_XINO_OFF, "xino=off"}, + {OPT_XINO_AUTO, "xino=auto"}, {OPT_ERR, NULL} }; @@ -513,6 +543,18 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->nfs_export = false; break; + case OPT_XINO_ON: + config->xino = OVL_XINO_ON; + break; + + case OPT_XINO_OFF: + config->xino = OVL_XINO_OFF; + break; + + case OPT_XINO_AUTO: + config->xino = OVL_XINO_AUTO; + break; + default: pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; @@ -1197,9 +1239,31 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, ofs->numlower++; } - /* When all layers on same fs, overlay can use real inode numbers */ - if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt)) + /* + * When all layers on same fs, overlay can use real inode numbers. + * With mount option "xino=on", mounter declares that there are enough + * free high bits in underlying fs to hold the unique fsid. + * If overlayfs does encounter underlying inodes using the high xino + * bits reserved for fsid, it emits a warning and uses the original + * inode number. + */ + if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt)) { ofs->xino_bits = 0; + ofs->config.xino = OVL_XINO_OFF; + } else if (ofs->config.xino == OVL_XINO_ON && !ofs->xino_bits) { + /* + * This is a roundup of number of bits needed for numlowerfs+1 + * (i.e. ilog2(numlowerfs+1 - 1) + 1). fsid 0 is reserved for + * upper fs even with non upper overlay. + */ + BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31); + ofs->xino_bits = ilog2(ofs->numlowerfs) + 1; + } + + if (ofs->xino_bits) { + pr_info("overlayfs: \"xino\" feature enabled using %d upper inode bits.\n", + ofs->xino_bits); + } err = 0; out: @@ -1311,6 +1375,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ofs->config.index = ovl_index_def; ofs->config.nfs_export = ovl_nfs_export_def; + ofs->config.xino = ovl_xino_def(); err = ovl_parse_opt((char *) data, &ofs->config); if (err) goto out_err; @@ -1325,7 +1390,9 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; sb->s_maxbytes = MAX_LFS_FILESIZE; /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ - ofs->xino_bits = BITS_PER_LONG - 32; + if (ofs->config.xino != OVL_XINO_OFF) + ofs->xino_bits = BITS_PER_LONG - 32; + if (ofs->config.upperdir) { if (!ofs->config.workdir) { pr_err("overlayfs: missing 'workdir'\n"); From 16149013f8391d3bd7bcbcbe3bfd2fd06ba110e4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 29 Mar 2018 16:36:56 +0300 Subject: [PATCH 19/19] ovl: update documentation w.r.t "xino" feature Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- Documentation/filesystems/overlayfs.txt | 39 +++++++++++++++++++++---- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 6ea1e64d1464..961b287ef323 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -14,9 +14,13 @@ The result will inevitably fail to look exactly like a normal filesystem for various technical reasons. The expectation is that many use cases will be able to ignore these differences. -This approach is 'hybrid' because the objects that appear in the -filesystem do not all appear to belong to that filesystem. In many -cases an object accessed in the union will be indistinguishable + +Overlay objects +--------------- + +The overlay filesystem approach is 'hybrid', because the objects that +appear in the filesystem do not always appear to belong to that filesystem. +In many cases, an object accessed in the union will be indistinguishable from accessing the corresponding object from the original filesystem. This is most obvious from the 'st_dev' field returned by stat(2). @@ -34,6 +38,19 @@ make the overlay mount more compliant with filesystem scanners and overlay objects will be distinguishable from the corresponding objects in the original filesystem. +On 64bit systems, even if all overlay layers are not on the same +underlying filesystem, the same compliant behavior could be achieved +with the "xino" feature. The "xino" feature composes a unique object +identifier from the real object st_ino and an underlying fsid index. +If all underlying filesystems support NFS file handles and export file +handles with 32bit inode number encoding (e.g. ext4), overlay filesystem +will use the high inode number bits for fsid. Even when the underlying +filesystem uses 64bit inode numbers, users can still enable the "xino" +feature with the "-o xino=on" overlay mount option. That is useful for the +case of underlying filesystems like xfs and tmpfs, which use 64bit inode +numbers, but are very unlikely to use the high inode number bit. + + Upper and Lower --------------- @@ -290,10 +307,19 @@ Non-standard behavior --------------------- The copy_up operation essentially creates a new, identical file and -moves it over to the old name. The new file may be on a different -filesystem, so both st_dev and st_ino of the file may change. +moves it over to the old name. Any open files referring to this inode +will access the old data. -Any open files referring to this inode will access the old data. +The new file may be on a different filesystem, so both st_dev and st_ino +of the real file may change. The values of st_dev and st_ino returned by +stat(2) on an overlay object are often not the same as the real file +stat(2) values to prevent the values from changing on copy_up. + +Unless "xino" feature is enabled, when overlay layers are not all on the +same underlying filesystem, the value of st_dev may be different for two +non-directory objects in the same overlay filesystem and the value of +st_ino for directory objects may be non persistent and could change even +while the overlay filesystem is still mounted. Unless "inode index" feature is enabled, if a file with multiple hard links is copied up, then this will "break" the link. Changes will not be @@ -302,6 +328,7 @@ propagated to other names referring to the same inode. Unless "redirect_dir" feature is enabled, rename(2) on a lower or merged directory will fail with EXDEV. + Changes to underlying filesystems ---------------------------------