From 1b52931ca9b5b87e237c591f99201b6254c00809 Mon Sep 17 00:00:00 2001 From: Zhi Zhang Date: Fri, 22 Mar 2019 14:16:33 +0800 Subject: [PATCH 01/29] ceph: remove duplicated filelock ref increase Inode i_filelock_ref is increased in ceph_lock or ceph_flock, but it is increased again in ceph_lock_message. This results in this ref won't become zero. If CEPH_I_ERROR_FILELOCK flag is set in remove_session_caps once, this flag can't be cleared even if client is back to normal. So further file lock will return EIO. Signed-off-by: Zhi Zhang Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/locks.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 9dae2ec7e1fa..ac9b53b89365 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -237,15 +237,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { err = -EIO; - } else if (op == CEPH_MDS_OP_SETFILELOCK) { - /* - * increasing i_filelock_ref closes race window between - * handling request reply and adding file_lock struct to - * inode. Otherwise, i_auth_cap may get trimmed in the - * window. Caller function will decrease the counter. - */ - fl->fl_ops = &ceph_fl_lock_ops; - atomic_inc(&ci->i_filelock_ref); } spin_unlock(&ci->i_ceph_lock); if (err < 0) { @@ -299,10 +290,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { err = -EIO; - } else { - /* see comment in ceph_lock */ - fl->fl_ops = &ceph_fl_lock_ops; - atomic_inc(&ci->i_filelock_ref); } spin_unlock(&ci->i_ceph_lock); if (err < 0) { From 3886274adf34a4e38417772e3d1c0b213380004e Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Thu, 21 Mar 2019 10:20:09 +0000 Subject: [PATCH 02/29] ceph: factor out ceph_lookup_inode() This function will be used by __fh_to_dentry and by the quotas code, to find quota realm inodes that are not visible in the mountpoint. Signed-off-by: Luis Henriques Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 14 ++++++++++++-- fs/ceph/super.h | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 3c59ad180ef0..d64e7472fa41 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -59,7 +59,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, return type; } -static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) +struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; @@ -91,13 +91,23 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) ihold(inode); ceph_mdsc_put_request(req); if (!inode) - return ERR_PTR(-ESTALE); + return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE); if (inode->i_nlink == 0) { iput(inode); return ERR_PTR(-ESTALE); } } + return inode; +} + +static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) +{ + struct inode *inode = ceph_lookup_inode(sb, ino); + + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_obtain_alias(inode); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 16c03188578e..976f200164f9 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1082,6 +1082,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* export.c */ extern const struct export_operations ceph_export_ops; +struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino); /* locks.c */ extern __init void ceph_flock_init(void); From 0c44a8e0fc55f56a70f72e67d7cc5b9341dae7d1 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Thu, 21 Mar 2019 10:20:10 +0000 Subject: [PATCH 03/29] ceph: quota: fix quota subdir mounts The CephFS kernel client does not enforce quotas set in a directory that isn't visible from the mount point. For example, given the path '/dir1/dir2', if quotas are set in 'dir1' and the filesystem is mounted with mount -t ceph ::/dir1/ /mnt then the client won't be able to access 'dir1' inode, even if 'dir2' belongs to a quota realm that points to it. This patch fixes this issue by simply doing an MDS LOOKUPINO operation for unknown inodes. Any inode reference obtained this way will be added to a list in ceph_mds_client, and will only be released when the filesystem is umounted. Link: https://tracker.ceph.com/issues/38482 Reported-by: Hendrik Peyerl Signed-off-by: Luis Henriques Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 + fs/ceph/mds_client.h | 18 +++++ fs/ceph/quota.c | 181 ++++++++++++++++++++++++++++++++++++++++--- fs/ceph/super.h | 1 + 4 files changed, 192 insertions(+), 12 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9049c2a3e972..5dee98b4cfde 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4125,6 +4125,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) mdsc->max_sessions = 0; mdsc->stopping = 0; atomic64_set(&mdsc->quotarealms_count, 0); + mdsc->quotarealms_inodes = RB_ROOT; + mutex_init(&mdsc->quotarealms_inodes_mutex); mdsc->last_snap_seq = 0; init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; @@ -4216,6 +4218,8 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) * their inode/dcache refs */ ceph_msgr_flush(); + + ceph_cleanup_quotarealms_inodes(mdsc); } /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 50385a481fdb..3f0029aa8a39 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -325,6 +325,18 @@ struct ceph_snapid_map { unsigned long last_used; }; +/* + * node for list of quotarealm inodes that are not visible from the filesystem + * mountpoint, but required to handle, e.g. quotas. + */ +struct ceph_quotarealm_inode { + struct rb_node node; + u64 ino; + unsigned long timeout; /* last time a lookup failed for this inode */ + struct mutex mutex; + struct inode *inode; +}; + /* * mds client state */ @@ -344,6 +356,12 @@ struct ceph_mds_client { int stopping; /* true if shutting down */ atomic64_t quotarealms_count; /* # realms with quota */ + /* + * We keep a list of inodes we don't see in the mountpoint but that we + * need to track quota realms. + */ + struct rb_root quotarealms_inodes; + struct mutex quotarealms_inodes_mutex; /* * snap_rwsem will cover cap linkage into snaprealms, and diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 9455d3aef0c3..c4522212872c 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -22,7 +22,16 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc) static inline bool ceph_has_realms_with_quotas(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - return atomic64_read(&mdsc->quotarealms_count) > 0; + struct super_block *sb = mdsc->fsc->sb; + + if (atomic64_read(&mdsc->quotarealms_count) > 0) + return true; + /* if root is the real CephFS root, we don't have quota realms */ + if (sb->s_root->d_inode && + (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT)) + return false; + /* otherwise, we can't know for sure */ + return true; } void ceph_handle_quota(struct ceph_mds_client *mdsc, @@ -68,6 +77,108 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, iput(inode); } +static struct ceph_quotarealm_inode * +find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino) +{ + struct ceph_quotarealm_inode *qri = NULL; + struct rb_node **node, *parent = NULL; + + mutex_lock(&mdsc->quotarealms_inodes_mutex); + node = &(mdsc->quotarealms_inodes.rb_node); + while (*node) { + parent = *node; + qri = container_of(*node, struct ceph_quotarealm_inode, node); + + if (ino < qri->ino) + node = &((*node)->rb_left); + else if (ino > qri->ino) + node = &((*node)->rb_right); + else + break; + } + if (!qri || (qri->ino != ino)) { + /* Not found, create a new one and insert it */ + qri = kmalloc(sizeof(*qri), GFP_KERNEL); + if (qri) { + qri->ino = ino; + qri->inode = NULL; + qri->timeout = 0; + mutex_init(&qri->mutex); + rb_link_node(&qri->node, parent, node); + rb_insert_color(&qri->node, &mdsc->quotarealms_inodes); + } else + pr_warn("Failed to alloc quotarealms_inode\n"); + } + mutex_unlock(&mdsc->quotarealms_inodes_mutex); + + return qri; +} + +/* + * This function will try to lookup a realm inode which isn't visible in the + * filesystem mountpoint. A list of these kind of inodes (not visible) is + * maintained in the mdsc and freed only when the filesystem is umounted. + * + * Note that these inodes are kept in this list even if the lookup fails, which + * allows to prevent useless lookup requests. + */ +static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, + struct super_block *sb, + struct ceph_snap_realm *realm) +{ + struct ceph_quotarealm_inode *qri; + struct inode *in; + + qri = find_quotarealm_inode(mdsc, realm->ino); + if (!qri) + return NULL; + + mutex_lock(&qri->mutex); + if (qri->inode) { + /* A request has already returned the inode */ + mutex_unlock(&qri->mutex); + return qri->inode; + } + /* Check if this inode lookup has failed recently */ + if (qri->timeout && + time_before_eq(jiffies, qri->timeout)) { + mutex_unlock(&qri->mutex); + return NULL; + } + in = ceph_lookup_inode(sb, realm->ino); + if (IS_ERR(in)) { + pr_warn("Can't lookup inode %llx (err: %ld)\n", + realm->ino, PTR_ERR(in)); + qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */ + } else { + qri->timeout = 0; + qri->inode = in; + } + mutex_unlock(&qri->mutex); + + return in; +} + +void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) +{ + struct ceph_quotarealm_inode *qri; + struct rb_node *node; + + /* + * It should now be safe to clean quotarealms_inode tree without holding + * mdsc->quotarealms_inodes_mutex... + */ + mutex_lock(&mdsc->quotarealms_inodes_mutex); + while (!RB_EMPTY_ROOT(&mdsc->quotarealms_inodes)) { + node = rb_first(&mdsc->quotarealms_inodes); + qri = rb_entry(node, struct ceph_quotarealm_inode, node); + rb_erase(node, &mdsc->quotarealms_inodes); + iput(qri->inode); + kfree(qri); + } + mutex_unlock(&mdsc->quotarealms_inodes_mutex); +} + /* * This function walks through the snaprealm for an inode and returns the * ceph_snap_realm for the first snaprealm that has quotas set (either max_files @@ -76,9 +187,15 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. + * + * Callers of this function need to hold mdsc->snap_rwsem. However, if there's + * a need to do an inode lookup, this rwsem will be temporarily dropped. Hence + * the 'retry' argument: if rwsem needs to be dropped and 'retry' is 'false' + * this function will return -EAGAIN; otherwise, the snaprealms walk-through + * will be restarted. */ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, - struct inode *inode) + struct inode *inode, bool retry) { struct ceph_inode_info *ci = NULL; struct ceph_snap_realm *realm, *next; @@ -88,6 +205,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, if (ceph_snap(inode) != CEPH_NOSNAP) return NULL; +restart: realm = ceph_inode(inode)->i_snap_realm; if (realm) ceph_get_snap_realm(mdsc, realm); @@ -95,11 +213,25 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) " "null i_snap_realm\n", ceph_vinop(inode)); while (realm) { + bool has_inode; + spin_lock(&realm->inodes_with_caps_lock); - in = realm->inode ? igrab(realm->inode) : NULL; + has_inode = realm->inode; + in = has_inode ? igrab(realm->inode) : NULL; spin_unlock(&realm->inodes_with_caps_lock); - if (!in) + if (has_inode && !in) break; + if (!in) { + up_read(&mdsc->snap_rwsem); + in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm); + down_read(&mdsc->snap_rwsem); + if (IS_ERR_OR_NULL(in)) + break; + ceph_put_snap_realm(mdsc, realm); + if (!retry) + return ERR_PTR(-EAGAIN); + goto restart; + } ci = ceph_inode(in); has_quota = __ceph_has_any_quota(ci); @@ -125,9 +257,22 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) struct ceph_snap_realm *old_realm, *new_realm; bool is_same; +restart: + /* + * We need to lookup 2 quota realms atomically, i.e. with snap_rwsem. + * However, get_quota_realm may drop it temporarily. By setting the + * 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was + * dropped and we can then restart the whole operation. + */ down_read(&mdsc->snap_rwsem); - old_realm = get_quota_realm(mdsc, old); - new_realm = get_quota_realm(mdsc, new); + old_realm = get_quota_realm(mdsc, old, true); + new_realm = get_quota_realm(mdsc, new, false); + if (PTR_ERR(new_realm) == -EAGAIN) { + up_read(&mdsc->snap_rwsem); + if (old_realm) + ceph_put_snap_realm(mdsc, old_realm); + goto restart; + } is_same = (old_realm == new_realm); up_read(&mdsc->snap_rwsem); @@ -166,6 +311,7 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, return false; down_read(&mdsc->snap_rwsem); +restart: realm = ceph_inode(inode)->i_snap_realm; if (realm) ceph_get_snap_realm(mdsc, realm); @@ -173,12 +319,23 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) " "null i_snap_realm\n", ceph_vinop(inode)); while (realm) { - spin_lock(&realm->inodes_with_caps_lock); - in = realm->inode ? igrab(realm->inode) : NULL; - spin_unlock(&realm->inodes_with_caps_lock); - if (!in) - break; + bool has_inode; + spin_lock(&realm->inodes_with_caps_lock); + has_inode = realm->inode; + in = has_inode ? igrab(realm->inode) : NULL; + spin_unlock(&realm->inodes_with_caps_lock); + if (has_inode && !in) + break; + if (!in) { + up_read(&mdsc->snap_rwsem); + in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm); + down_read(&mdsc->snap_rwsem); + if (IS_ERR_OR_NULL(in)) + break; + ceph_put_snap_realm(mdsc, realm); + goto restart; + } ci = ceph_inode(in); spin_lock(&ci->i_ceph_lock); if (op == QUOTA_CHECK_MAX_FILES_OP) { @@ -314,7 +471,7 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) bool is_updated = false; down_read(&mdsc->snap_rwsem); - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root)); + realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true); up_read(&mdsc->snap_rwsem); if (!realm) return false; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 976f200164f9..a4b0da31d199 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1133,5 +1133,6 @@ extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newlen); extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); +extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */ From 570df4e9c23f861aa3f8f2954468c534a033bf1a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 15 Nov 2017 17:39:40 +0800 Subject: [PATCH 04/29] ceph: snapshot nfs re-export To support snapshot nfs re-export, we need a way to lookup snapped inode by file handle. For directory inode, snapped metadata are always stored together with head inode. Client just need to pass vinodeno_t to MDS. For non-directory inode, there can be multiple version of snapped inodes and they can be stored in different dirfrags. Besides vinodeno_t, client also need to tell mds from which dirfrag it got the snapped inode. Another problem of supporting snapshot nfs re-export is that there can be multiple paths to access a snapped inode. For example: mkdir -p d1/d2/d3 mkdir d1/.snap/s1 Paths 'd1/.snap/s1/d2/d3', 'd1/d2/.snap/_s1_/d3' and 'd1/d2/d3/.snap/_s1_' are all reference to the same snapped inode. For a given snapped inode, There is no convenient way to get the first form and the second form paths. For simplicity, ceph_get_parent() return snapdir for snapped directory inode. Furthermore, client may access snapshot of deleted directory. For example: mkdir -p d1/d2 mkdir d1/.snap/s1 open d1/.snap/s1/d2 rm -rf d1/d2 The path constucted by ceph_get_parent() and ceph_get_name() is '/.snap/_s1_'. Futher lookup parent of will fail. To workaround this case, this patch uses d_obtain_root() to get dentry for snapdir of deleted directory. snapdir dentry has no DCACHE_DISCONNECTED flag set, reconnect_path() stops when it reaches snapdir dentry. Link: http://tracker.ceph.com/issues/22105 Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 348 ++++++++++++++++++++++++++++++++--- include/linux/ceph/ceph_fs.h | 6 + 2 files changed, 328 insertions(+), 26 deletions(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index d64e7472fa41..d3ef7ee429ec 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -22,18 +22,77 @@ struct ceph_nfs_confh { u64 ino, parent_ino; } __attribute__ ((packed)); +/* + * fh for snapped inode + */ +struct ceph_nfs_snapfh { + u64 ino; + u64 snapid; + u64 parent_ino; + u32 hash; +} __attribute__ ((packed)); + +static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, + struct inode *parent_inode) +{ + const static int snap_handle_length = + sizeof(struct ceph_nfs_snapfh) >> 2; + struct ceph_nfs_snapfh *sfh = (void *)rawfh; + u64 snapid = ceph_snap(inode); + int ret; + bool no_parent = true; + + if (*max_len < snap_handle_length) { + *max_len = snap_handle_length; + ret = FILEID_INVALID; + goto out; + } + + ret = -EINVAL; + if (snapid != CEPH_SNAPDIR) { + struct inode *dir; + struct dentry *dentry = d_find_alias(inode); + if (!dentry) + goto out; + + rcu_read_lock(); + dir = d_inode_rcu(dentry->d_parent); + if (ceph_snap(dir) != CEPH_SNAPDIR) { + sfh->parent_ino = ceph_ino(dir); + sfh->hash = ceph_dentry_hash(dir, dentry); + no_parent = false; + } + rcu_read_unlock(); + dput(dentry); + } + + if (no_parent) { + if (!S_ISDIR(inode->i_mode)) + goto out; + sfh->parent_ino = sfh->ino; + sfh->hash = 0; + } + sfh->ino = ceph_ino(inode); + sfh->snapid = snapid; + + *max_len = snap_handle_length; + ret = FILEID_BTRFS_WITH_PARENT; +out: + dout("encode_snapfh %llx.%llx ret=%d\n", ceph_vinop(inode), ret); + return ret; +} + static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { + const static int handle_length = + sizeof(struct ceph_nfs_fh) >> 2; + const static int connected_handle_length = + sizeof(struct ceph_nfs_confh) >> 2; int type; - struct ceph_nfs_fh *fh = (void *)rawfh; - struct ceph_nfs_confh *cfh = (void *)rawfh; - int connected_handle_length = sizeof(*cfh)/4; - int handle_length = sizeof(*fh)/4; - /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) - return -EINVAL; + return ceph_encode_snapfh(inode, rawfh, max_len, parent_inode); if (parent_inode && (*max_len < connected_handle_length)) { *max_len = connected_handle_length; @@ -44,6 +103,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, } if (parent_inode) { + struct ceph_nfs_confh *cfh = (void *)rawfh; dout("encode_fh %llx with parent %llx\n", ceph_ino(inode), ceph_ino(parent_inode)); cfh->ino = ceph_ino(inode); @@ -51,6 +111,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, *max_len = connected_handle_length; type = FILEID_INO32_GEN_PARENT; } else { + struct ceph_nfs_fh *fh = (void *)rawfh; dout("encode_fh %llx\n", ceph_ino(inode)); fh->ino = ceph_ino(inode); *max_len = handle_length; @@ -59,7 +120,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, return type; } -struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) +static struct inode *__lookup_inode(struct super_block *sb, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; @@ -81,7 +142,7 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) mask = CEPH_STAT_CAP_INODE; if (ceph_security_xattr_wanted(d_inode(sb->s_root))) mask |= CEPH_CAP_XATTR_SHARED; - req->r_args.getattr.mask = cpu_to_le32(mask); + req->r_args.lookupino.mask = cpu_to_le32(mask); req->r_ino1 = vino; req->r_num_caps = 1; @@ -92,25 +153,113 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) ceph_mdsc_put_request(req); if (!inode) return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE); - if (inode->i_nlink == 0) { - iput(inode); - return ERR_PTR(-ESTALE); - } } + return inode; +} +struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode = __lookup_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + if (inode->i_nlink == 0) { + iput(inode); + return ERR_PTR(-ESTALE); + } return inode; } static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { - struct inode *inode = ceph_lookup_inode(sb, ino); - + struct inode *inode = __lookup_inode(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); - + if (inode->i_nlink == 0) { + iput(inode); + return ERR_PTR(-ESTALE); + } return d_obtain_alias(inode); } +static struct dentry *__snapfh_to_dentry(struct super_block *sb, + struct ceph_nfs_snapfh *sfh, + bool want_parent) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_request *req; + struct inode *inode; + struct ceph_vino vino; + int mask; + int err; + bool unlinked = false; + + if (want_parent) { + vino.ino = sfh->parent_ino; + if (sfh->snapid == CEPH_SNAPDIR) + vino.snap = CEPH_NOSNAP; + else if (sfh->ino == sfh->parent_ino) + vino.snap = CEPH_SNAPDIR; + else + vino.snap = sfh->snapid; + } else { + vino.ino = sfh->ino; + vino.snap = sfh->snapid; + } + inode = ceph_find_inode(sb, vino); + if (inode) + return d_obtain_alias(inode); + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + mask = CEPH_STAT_CAP_INODE; + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.lookupino.mask = cpu_to_le32(mask); + if (vino.snap < CEPH_NOSNAP) { + req->r_args.lookupino.snapid = cpu_to_le64(vino.snap); + if (!want_parent && sfh->ino != sfh->parent_ino) { + req->r_args.lookupino.parent = + cpu_to_le64(sfh->parent_ino); + req->r_args.lookupino.hash = + cpu_to_le32(sfh->hash); + } + } + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) { + if (vino.snap == CEPH_SNAPDIR) { + if (inode->i_nlink == 0) + unlinked = true; + inode = ceph_get_snapdir(inode); + } else if (ceph_snap(inode) == vino.snap) { + ihold(inode); + } else { + /* mds does not support lookup snapped inode */ + err = -EOPNOTSUPP; + inode = NULL; + } + } + ceph_mdsc_put_request(req); + + if (want_parent) { + dout("snapfh_to_parent %llx.%llx\n err=%d\n", + vino.ino, vino.snap, err); + } else { + dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d", + vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err); + } + if (!inode) + return ERR_PTR(-ESTALE); + /* see comments in ceph_get_parent() */ + return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode); +} + /* * convert regular fh to dentry */ @@ -120,6 +269,11 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb, { struct ceph_nfs_fh *fh = (void *)fid->raw; + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + struct ceph_nfs_snapfh *sfh = (void *)fid->raw; + return __snapfh_to_dentry(sb, sfh, false); + } + if (fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) return NULL; @@ -173,13 +327,49 @@ static struct dentry *__get_parent(struct super_block *sb, static struct dentry *ceph_get_parent(struct dentry *child) { - /* don't re-export snaps */ - if (ceph_snap(d_inode(child)) != CEPH_NOSNAP) - return ERR_PTR(-EINVAL); + struct inode *inode = d_inode(child); + struct dentry *dn; - dout("get_parent %p ino %llx.%llx\n", - child, ceph_vinop(d_inode(child))); - return __get_parent(child->d_sb, child, 0); + if (ceph_snap(inode) != CEPH_NOSNAP) { + struct inode* dir; + bool unlinked = false; + /* do not support non-directory */ + if (!d_is_dir(child)) { + dn = ERR_PTR(-EINVAL); + goto out; + } + dir = __lookup_inode(inode->i_sb, ceph_ino(inode)); + if (IS_ERR(dir)) { + dn = ERR_CAST(dir); + goto out; + } + /* There can be multiple paths to access snapped inode. + * For simplicity, treat snapdir of head inode as parent */ + if (ceph_snap(inode) != CEPH_SNAPDIR) { + struct inode *snapdir = ceph_get_snapdir(dir); + if (dir->i_nlink == 0) + unlinked = true; + iput(dir); + if (IS_ERR(snapdir)) { + dn = ERR_CAST(snapdir); + goto out; + } + dir = snapdir; + } + /* If directory has already been deleted, futher get_parent + * will fail. Do not mark snapdir dentry as disconnected, + * this prevent exportfs from doing futher get_parent. */ + if (unlinked) + dn = d_obtain_root(dir); + else + dn = d_obtain_alias(dir); + } else { + dn = __get_parent(child->d_sb, child, 0); + } +out: + dout("get_parent %p ino %llx.%llx err=%ld\n", + child, ceph_vinop(inode), (IS_ERR(dn) ? PTR_ERR(dn) : 0)); + return dn; } /* @@ -192,6 +382,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, struct ceph_nfs_confh *cfh = (void *)fid->raw; struct dentry *dentry; + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + struct ceph_nfs_snapfh *sfh = (void *)fid->raw; + return __snapfh_to_dentry(sb, sfh, true); + } + if (fh_type != FILEID_INO32_GEN_PARENT) return NULL; if (fh_len < sizeof(*cfh) / 4) @@ -204,14 +399,115 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, return dentry; } +static int __get_snap_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = d_inode(child); + struct inode *dir = d_inode(parent); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_request *req = NULL; + char *last_name = NULL; + unsigned next_offset = 2; + int err = -EINVAL; + + if (ceph_ino(inode) != ceph_ino(dir)) + goto out; + if (ceph_snap(inode) == CEPH_SNAPDIR) { + if (ceph_snap(dir) == CEPH_NOSNAP) { + strcpy(name, fsc->mount_options->snapdir_name); + err = 0; + } + goto out; + } + if (ceph_snap(dir) != CEPH_SNAPDIR) + goto out; + + while (1) { + struct ceph_mds_reply_info_parsed *rinfo; + struct ceph_mds_reply_dir_entry *rde; + int i; + + req = ceph_mdsc_create_request(fsc->mdsc, CEPH_MDS_OP_LSSNAP, + USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + req = NULL; + goto out; + } + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) + goto out; + + req->r_direct_mode = USE_AUTH_MDS; + req->r_readdir_offset = next_offset; + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); + if (last_name) { + req->r_path2 = last_name; + last_name = NULL; + } + + req->r_inode = dir; + ihold(dir); + req->r_dentry = dget(parent); + + inode_lock(dir); + err = ceph_mdsc_do_request(fsc->mdsc, NULL, req); + inode_unlock(dir); + + if (err < 0) + goto out; + + rinfo = &req->r_reply_info; + for (i = 0; i < rinfo->dir_nr; i++) { + rde = rinfo->dir_entries + i; + BUG_ON(!rde->inode.in); + if (ceph_snap(inode) == + le64_to_cpu(rde->inode.in->snapid)) { + memcpy(name, rde->name, rde->name_len); + name[rde->name_len] = '\0'; + err = 0; + goto out; + } + } + + if (rinfo->dir_end) + break; + + BUG_ON(rinfo->dir_nr <= 0); + rde = rinfo->dir_entries + (rinfo->dir_nr - 1); + next_offset += rinfo->dir_nr; + last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); + if (!last_name) { + err = -ENOMEM; + goto out; + } + + ceph_mdsc_put_request(req); + req = NULL; + } + err = -ENOENT; +out: + if (req) + ceph_mdsc_put_request(req); + kfree(last_name); + dout("get_snap_name %p ino %llx.%llx err=%d\n", + child, ceph_vinop(inode), err); + return err; +} + static int ceph_get_name(struct dentry *parent, char *name, struct dentry *child) { struct ceph_mds_client *mdsc; struct ceph_mds_request *req; + struct inode *inode = d_inode(child); int err; - mdsc = ceph_inode_to_client(d_inode(child))->mdsc; + if (ceph_snap(inode) != CEPH_NOSNAP) + return __get_snap_name(parent, name, child); + + mdsc = ceph_inode_to_client(inode)->mdsc; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, USE_ANY_MDS); if (IS_ERR(req)) @@ -219,8 +515,8 @@ static int ceph_get_name(struct dentry *parent, char *name, inode_lock(d_inode(parent)); - req->r_inode = d_inode(child); - ihold(d_inode(child)); + req->r_inode = inode; + ihold(inode); req->r_ino2 = ceph_vino(d_inode(parent)); req->r_parent = d_inode(parent); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); @@ -234,10 +530,10 @@ static int ceph_get_name(struct dentry *parent, char *name, memcpy(name, rinfo->dname, rinfo->dname_len); name[rinfo->dname_len] = 0; dout("get_name %p ino %llx.%llx name %s\n", - child, ceph_vinop(d_inode(child)), name); + child, ceph_vinop(inode), name); } else { dout("get_name %p ino %llx.%llx err %d\n", - child, ceph_vinop(d_inode(child)), err); + child, ceph_vinop(inode), err); } ceph_mdsc_put_request(req); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 4903deb0777a..3ac0feaf2b5e 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -436,6 +436,12 @@ union ceph_mds_request_args { __le64 length; /* num bytes to lock from start */ __u8 wait; /* will caller wait for lock to become available? */ } __attribute__ ((packed)) filelock_change; + struct { + __le32 mask; /* CEPH_CAP_* */ + __le64 snapid; + __le64 parent; + __le32 hash; + } __attribute__ ((packed)) lookupino; } __attribute__ ((packed)); #define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ From d342a15b1e8547fd033f65ce949a8d8501e5ac8f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Mar 2019 15:36:37 +0100 Subject: [PATCH 05/29] rbd: avoid clang -Wuninitialized warning clang fails to see that rbd_assert(0) ends in an unreachable code path and warns about a subsequent use of an uninitialized variable when CONFIG_PROFILE_ANNOTATED_BRANCHES is set: drivers/block/rbd.c:2402:4: error: variable 'ret' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized] rbd_assert(0); ^~~~~~~~~~~~~ drivers/block/rbd.c:563:7: note: expanded from macro 'rbd_assert' if (unlikely(!(expr))) { \ ^~~~~~~~~~~~~~~~~ include/linux/compiler.h:48:23: note: expanded from macro 'unlikely' # define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x))) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/block/rbd.c:2410:6: note: uninitialized use occurs here if (ret) { ^~~ drivers/block/rbd.c:2402:4: note: remove the 'if' if its condition is always true rbd_assert(0); ^ drivers/block/rbd.c:563:3: note: expanded from macro 'rbd_assert' if (unlikely(!(expr))) { \ ^ drivers/block/rbd.c:2376:9: note: initialize the variable 'ret' to silence this warning int ret; ^ = 0 1 error generated. This seems to be a bug in clang, but is easy to work around by using an unconditional BUG(). Signed-off-by: Arnd Bergmann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2210c1b9491b..414037fef3ef 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2383,7 +2383,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) &obj_req->bvec_pos); break; default: - rbd_assert(0); + BUG(); } } else { ret = rbd_img_fill_from_bvecs(child_img_req, From 1680937266587906138752c2dfc80a45adb774c7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Mar 2019 17:53:56 +0100 Subject: [PATCH 06/29] rbd: convert all rbd_assert(0) to BUG() rbd_assert(0) has caused different issues depending on the compiler version in the past, so it seems better to avoid it completely. Replace the remaining instances. Signed-off-by: Arnd Bergmann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 414037fef3ef..612e4aecfa7f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1326,7 +1326,7 @@ static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, zero_bvecs(&obj_req->bvec_pos, off, bytes); break; default: - rbd_assert(0); + BUG(); } } @@ -1581,7 +1581,7 @@ static void rbd_obj_request_destroy(struct kref *kref) kfree(obj_request->bvec_pos.bvecs); break; default: - rbd_assert(0); + BUG(); } kfree(obj_request->img_extents); @@ -1781,7 +1781,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) &obj_req->bvec_pos); break; default: - rbd_assert(0); + BUG(); } } @@ -2036,7 +2036,7 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req) ret = rbd_obj_setup_zeroout(obj_req); break; default: - rbd_assert(0); + BUG(); } if (ret < 0) return ret; @@ -2515,7 +2515,7 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) num_osd_ops += count_zeroout_ops(obj_req); break; default: - rbd_assert(0); + BUG(); } obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); @@ -2542,7 +2542,7 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) __rbd_obj_setup_zeroout(obj_req, which); break; default: - rbd_assert(0); + BUG(); } ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); From 0384892c2d839033566b53453be44b1f5812cd00 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Mar 2019 13:51:43 +0100 Subject: [PATCH 07/29] libceph: fix clang warning for CEPH_DEFINE_OID_ONSTACK clang complains about assigning a variable to itself during the declaration: fs/ceph/ioctl.c:187:26: error: variable 'oid' is uninitialized when used within its own initialization [-Werror,-Wuninitialized] CEPH_DEFINE_OID_ONSTACK(oid); ^~~ include/linux/ceph/osdmap.h:122:52: note: expanded from macro 'CEPH_DEFINE_OID_ONSTACK' struct ceph_object_id oid = CEPH_OID_INIT_ONSTACK(oid) ~~~ ^~~ include/linux/ceph/osdmap.h:120:29: note: expanded from macro 'CEPH_OID_INIT_ONSTACK' ({ ceph_oid_init(&oid); oid; }) ^~~ We use this trick in other places, but it is completely unnecessary here, as we can just use a regular struct initializer. Signed-off-by: Arnd Bergmann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 5675b1f09bc5..e081b56f1c1d 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -110,17 +110,16 @@ struct ceph_object_id { int name_len; }; +#define __CEPH_OID_INITIALIZER(oid) { .name = (oid).inline_name } + +#define CEPH_DEFINE_OID_ONSTACK(oid) \ + struct ceph_object_id oid = __CEPH_OID_INITIALIZER(oid) + static inline void ceph_oid_init(struct ceph_object_id *oid) { - oid->name = oid->inline_name; - oid->name_len = 0; + *oid = (struct ceph_object_id) __CEPH_OID_INITIALIZER(*oid); } -#define CEPH_OID_INIT_ONSTACK(oid) \ - ({ ceph_oid_init(&oid); oid; }) -#define CEPH_DEFINE_OID_ONSTACK(oid) \ - struct ceph_object_id oid = CEPH_OID_INIT_ONSTACK(oid) - static inline bool ceph_oid_empty(const struct ceph_object_id *oid) { return oid->name == oid->inline_name && !oid->name_len; From ffb61c55b2501c3dcd266856e25430ae8e3753d6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Apr 2019 10:36:01 -0400 Subject: [PATCH 08/29] ceph: remove superfluous inode_lock in ceph_fsync Originally, filemap_write_and_wait took the i_mutex internally, but commit 02c24a82187d pushed the mutex acquisition into the individual fsync routines, leaving it up to the subsystem maintainers to remove it if it wasn't needed. For ceph, I see no reason to take the inode_lock here. All of the operations inside that lock are protected by their own locking. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 36a8dc699448..f976939f771f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2257,8 +2257,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (datasync) goto out; - inode_lock(inode); - dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); @@ -2273,7 +2271,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } - inode_unlock(inode); out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; From 428bb68ad99b9f03c35d7b34d60be54faf954181 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 11 Apr 2019 15:27:03 -0400 Subject: [PATCH 09/29] ceph: properly handle granular statx requests cephfs can benefit from statx. We can have the client just request caps sufficient for the needed attributes and leave off the rest. Also, recognize when AT_STATX_DONT_SYNC is set, and just scrape the inode without doing any call in that case. Force a call to the MDS in the event that AT_STATX_FORCE_SYNC is set. Link: http://tracker.ceph.com/issues/39258 Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Reviewed-by: David Howells Reviewed-by: Sage Weil Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 87 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index c2feb310ac1e..cf50d835b93f 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2269,43 +2269,72 @@ int ceph_permission(struct inode *inode, int mask) return err; } +/* Craft a mask of needed caps given a set of requested statx attrs. */ +static int statx_to_caps(u32 want) +{ + int mask = 0; + + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME)) + mask |= CEPH_CAP_AUTH_SHARED; + + if (want & (STATX_NLINK|STATX_CTIME)) + mask |= CEPH_CAP_LINK_SHARED; + + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| + STATX_BLOCKS)) + mask |= CEPH_CAP_FILE_SHARED; + + if (want & (STATX_CTIME)) + mask |= CEPH_CAP_XATTR_SHARED; + + return mask; +} + /* - * Get all attributes. Hopefully somedata we'll have a statlite() - * and can limit the fields we require to be accurate. + * Get all the attributes. If we have sufficient caps for the requested attrs, + * then we can avoid talking to the MDS at all. */ int ceph_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct ceph_inode_info *ci = ceph_inode(inode); - int err; + int err = 0; - err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false); - if (!err) { - generic_fillattr(inode, stat); - stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); - if (ceph_snap(inode) == CEPH_NOSNAP) - stat->dev = inode->i_sb->s_dev; - else - stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; - - if (S_ISDIR(inode->i_mode)) { - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), - RBYTES)) - stat->size = ci->i_rbytes; - else - stat->size = ci->i_files + ci->i_subdirs; - stat->blocks = 0; - stat->blksize = 65536; - /* - * Some applications rely on the number of st_nlink - * value on directories to be either 0 (if unlinked) - * or 2 + number of subdirectories. - */ - if (stat->nlink == 1) - /* '.' + '..' + subdirs */ - stat->nlink = 1 + 1 + ci->i_subdirs; - } + /* Skip the getattr altogether if we're asked not to sync */ + if (!(flags & AT_STATX_DONT_SYNC)) { + err = ceph_do_getattr(inode, statx_to_caps(request_mask), + flags & AT_STATX_FORCE_SYNC); + if (err) + return err; } + + generic_fillattr(inode, stat); + stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); + if (ceph_snap(inode) == CEPH_NOSNAP) + stat->dev = inode->i_sb->s_dev; + else + stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; + + if (S_ISDIR(inode->i_mode)) { + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), + RBYTES)) + stat->size = ci->i_rbytes; + else + stat->size = ci->i_files + ci->i_subdirs; + stat->blocks = 0; + stat->blksize = 65536; + /* + * Some applications rely on the number of st_nlink + * value on directories to be either 0 (if unlinked) + * or 2 + number of subdirectories. + */ + if (stat->nlink == 1) + /* '.' + '..' + subdirs */ + stat->nlink = 1 + 1 + ci->i_subdirs; + } + + /* Mask off any higher bits (e.g. btime) until we have support */ + stat->result_mask = request_mask & STATX_BASIC_STATS; return err; } From 40e7e2c0e86464bca839cdf891bd58a6d41b60b4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 23 Apr 2019 14:18:45 -0400 Subject: [PATCH 10/29] ceph: fix NULL pointer deref when debugging is enabled Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9f53c3d99304..7ae0f49349e3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -928,7 +928,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", (write ? "write" : "read"), file, pos, (unsigned)count, - snapc, snapc->seq); + snapc, snapc ? snapc->seq : 0); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count - 1); From f5d7726900b66e38355db878ced6b13b00fa9201 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 24 Apr 2019 12:09:04 -0400 Subject: [PATCH 11/29] ceph: make iterate_session_caps a public symbol Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 16 ++++++++-------- fs/ceph/mds_client.h | 4 ++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5dee98b4cfde..761cb669aa13 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1284,9 +1284,9 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, * * Caller must hold session s_mutex. */ -static int iterate_session_caps(struct ceph_mds_session *session, - int (*cb)(struct inode *, struct ceph_cap *, - void *), void *arg) +int ceph_iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, struct ceph_cap *, + void *), void *arg) { struct list_head *p; struct ceph_cap *cap; @@ -1451,7 +1451,7 @@ static void remove_session_caps(struct ceph_mds_session *session) LIST_HEAD(dispose); dout("remove_session_caps on %p\n", session); - iterate_session_caps(session, remove_session_caps_cb, fsc); + ceph_iterate_session_caps(session, remove_session_caps_cb, fsc); wake_up_all(&fsc->mdsc->cap_flushing_wq); @@ -1534,8 +1534,8 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, static void wake_up_session_caps(struct ceph_mds_session *session, int ev) { dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); - iterate_session_caps(session, wake_up_session_cb, - (void *)(unsigned long)ev); + ceph_iterate_session_caps(session, wake_up_session_cb, + (void *)(unsigned long)ev); } /* @@ -1768,7 +1768,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, session->s_mds, session->s_nr_caps, max_caps, trim_caps); if (trim_caps > 0) { session->s_trim_caps = trim_caps; - iterate_session_caps(session, trim_caps_cb, session); + ceph_iterate_session_caps(session, trim_caps_cb, session); dout("trim_caps mds%d done: %d / %d, trimmed %d\n", session->s_mds, session->s_nr_caps, max_caps, trim_caps - session->s_trim_caps); @@ -3642,7 +3642,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, recon_state.msg_version = 2; } /* trsaverse this session's caps */ - err = iterate_session_caps(session, encode_caps_cb, &recon_state); + err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state); spin_lock(&session->s_cap_lock); session->s_cap_reconnect = 0; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3f0029aa8a39..0d1f673a5689 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -486,6 +486,10 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); +extern int ceph_iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, + struct ceph_cap *, void *), + void *arg); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, From ff4a80bf2d3f8005dc5890381bc8ca48e259c60d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 24 Apr 2019 12:05:15 -0400 Subject: [PATCH 12/29] ceph: dump granular cap info in "caps" debugfs file We have a "caps" file already that gives statistics on the caps cache as a whole. Add another section to that output and dump a line for each individual cap record. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 98365e74cb4a..777f6ceb5259 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -124,18 +124,48 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } +static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) +{ + struct seq_file *s = p; + + seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented)); + return 0; +} + static int caps_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; - int total, avail, used, reserved, min; + struct ceph_mds_client *mdsc = fsc->mdsc; + int total, avail, used, reserved, min, i; ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" "avail\t\t%d\n" "used\t\t%d\n" "reserved\t%d\n" - "min\t%d\n", + "min\t\t%d\n\n", total, avail, used, reserved, min); + seq_printf(s, "ino issued implemented\n"); + seq_printf(s, "-----------------------------------------------\n"); + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *session; + + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + ceph_iterate_session_caps(session, caps_show_cb, s); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + return 0; } From 69a10fb3f4b8769ffd44e4eaa662ab691fa61f4c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 26 Apr 2019 13:33:39 -0400 Subject: [PATCH 13/29] ceph: fix potential use-after-free in ceph_mdsc_build_path temp is not defined outside of the RCU critical section here. Ensure we grab that value before we drop the rcu_read_lock. Reported-by: Al Viro Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 761cb669aa13..b01e2043b1b2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2089,13 +2089,14 @@ static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) * Encode hidden .snap dirs as a double /, i.e. * foo/.snap/bar -> foo//bar */ -char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, +char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase, int stop_on_nosnap) { struct dentry *temp; char *path; int len, pos; unsigned seq; + u64 base; if (!dentry) return ERR_PTR(-EINVAL); @@ -2151,6 +2152,7 @@ retry: path[--pos] = '/'; temp = temp->d_parent; } + base = ceph_ino(d_inode(temp)); rcu_read_unlock(); if (pos != 0 || read_seqretry(&rename_lock, seq)) { pr_err("build_path did not end path lookup where " @@ -2163,10 +2165,10 @@ retry: goto retry; } - *base = ceph_ino(d_inode(temp)); + *pbase = base; *plen = len; dout("build_path on %p %d built %llx '%.*s'\n", - dentry, d_count(dentry), *base, len, path); + dentry, d_count(dentry), base, len, path); return path; } From 964fff7491e4923e18ff08f2a254c4b94e3f83d6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 29 Apr 2019 11:51:02 -0400 Subject: [PATCH 14/29] ceph: use ceph_mdsc_build_path instead of clone_dentry_name While it may be slightly more efficient, it's probably not worthwhile to optimize for the case that clone_dentry_name handles. We can get the same result by just calling ceph_mdsc_build_path when the parent isn't locked, with less code duplication. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 42 +++--------------------------------------- 1 file changed, 3 insertions(+), 39 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b01e2043b1b2..7af722834348 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2172,56 +2172,20 @@ retry: return path; } -/* Duplicate the dentry->d_name.name safely */ -static int clone_dentry_name(struct dentry *dentry, const char **ppath, - int *ppathlen) -{ - u32 len; - char *name; - -retry: - len = READ_ONCE(dentry->d_name.len); - name = kmalloc(len + 1, GFP_NOFS); - if (!name) - return -ENOMEM; - - spin_lock(&dentry->d_lock); - if (dentry->d_name.len != len) { - spin_unlock(&dentry->d_lock); - kfree(name); - goto retry; - } - memcpy(name, dentry->d_name.name, len); - spin_unlock(&dentry->d_lock); - - name[len] = '\0'; - *ppath = name; - *ppathlen = len; - return 0; -} - static int build_dentry_path(struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, bool *pfreepath, bool parent_locked) { - int ret; char *path; rcu_read_lock(); if (!dir) dir = d_inode_rcu(dentry->d_parent); - if (dir && ceph_snap(dir) == CEPH_NOSNAP) { + if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) { *pino = ceph_ino(dir); rcu_read_unlock(); - if (parent_locked) { - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; - } else { - ret = clone_dentry_name(dentry, ppath, ppathlen); - if (ret) - return ret; - *pfreepath = true; - } + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; return 0; } rcu_read_unlock(); From f77f21bb28367d0ac4861a24da1db118bba850e6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 29 Apr 2019 12:13:14 -0400 Subject: [PATCH 15/29] ceph: use __getname/__putname in ceph_mdsc_build_path Al suggested we get rid of the kmalloc here and just use __getname and __putname to get a full PATH_MAX pathname buffer. Since we build the path in reverse, we continue to return a pointer to the beginning of the string and the length, and add a new helper to free the thing at the end. Suggested-by: Al Viro Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 4 +-- fs/ceph/mds_client.c | 65 +++++++++++++++++++------------------------- fs/ceph/mds_client.h | 6 ++++ 3 files changed, 36 insertions(+), 39 deletions(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 777f6ceb5259..b014fc7d4e3c 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -88,7 +88,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_dentry, path ? path : ""); spin_unlock(&req->r_dentry->d_lock); - kfree(path); + ceph_mdsc_free_path(path, pathlen); } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); @@ -108,7 +108,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_old_dentry, path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); - kfree(path); + ceph_mdsc_free_path(path, pathlen); } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7af722834348..d29f6c094f7c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2094,39 +2094,24 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase, { struct dentry *temp; char *path; - int len, pos; + int pos; unsigned seq; u64 base; if (!dentry) return ERR_PTR(-EINVAL); -retry: - len = 0; - seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - for (temp = dentry; !IS_ROOT(temp);) { - struct inode *inode = d_inode(temp); - if (inode && ceph_snap(inode) == CEPH_SNAPDIR) - len++; /* slash only */ - else if (stop_on_nosnap && inode && - ceph_snap(inode) == CEPH_NOSNAP) - break; - else - len += 1 + temp->d_name.len; - temp = temp->d_parent; - } - rcu_read_unlock(); - if (len) - len--; /* no leading '/' */ - - path = kmalloc(len+1, GFP_NOFS); + path = __getname(); if (!path) return ERR_PTR(-ENOMEM); - pos = len; - path[pos] = 0; /* trailing null */ +retry: + pos = PATH_MAX - 1; + path[pos] = '\0'; + + seq = read_seqbegin(&rename_lock); rcu_read_lock(); - for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) { + temp = dentry; + for (;;) { struct inode *inode; spin_lock(&temp->d_lock); @@ -2144,32 +2129,38 @@ retry: spin_unlock(&temp->d_lock); break; } - strncpy(path + pos, temp->d_name.name, - temp->d_name.len); + memcpy(path + pos, temp->d_name.name, temp->d_name.len); } spin_unlock(&temp->d_lock); - if (pos) - path[--pos] = '/'; temp = temp->d_parent; + + /* Are we at the root? */ + if (IS_ROOT(temp)) + break; + + /* Are we out of buffer? */ + if (--pos < 0) + break; + + path[pos] = '/'; } base = ceph_ino(d_inode(temp)); rcu_read_unlock(); - if (pos != 0 || read_seqretry(&rename_lock, seq)) { + if (pos < 0 || read_seqretry(&rename_lock, seq)) { pr_err("build_path did not end path lookup where " - "expected, namelen is %d, pos is %d\n", len, pos); + "expected, pos is %d\n", pos); /* presumably this is only possible if racing with a rename of one of the parent directories (we can not lock the dentries above us to prevent this, but retrying should be harmless) */ - kfree(path); goto retry; } *pbase = base; - *plen = len; + *plen = PATH_MAX - 1 - pos; dout("build_path on %p %d built %llx '%.*s'\n", - dentry, d_count(dentry), base, len, path); - return path; + dentry, d_count(dentry), base, *plen, path + pos); + return path + pos; } static int build_dentry_path(struct dentry *dentry, struct inode *dir, @@ -2376,10 +2367,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, out_free2: if (freepath2) - kfree((char *)path2); + ceph_mdsc_free_path((char *)path2, pathlen2); out_free1: if (freepath1) - kfree((char *)path1); + ceph_mdsc_free_path((char *)path1, pathlen1); out: return msg; } @@ -3451,7 +3442,7 @@ out_freeflocks: ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); out_freepath: - kfree(path); + ceph_mdsc_free_path(path, pathlen); } out_err: diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 0d1f673a5689..ebcad5afc87b 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -492,6 +492,12 @@ extern int ceph_iterate_session_caps(struct ceph_mds_session *session, void *arg); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); +static inline void ceph_mdsc_free_path(char *path, int len) +{ + if (path) + __putname(path - (PATH_MAX - 1 - len)); +} + extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, int stop_on_nosnap); From c1dfc277239c73f68a6af6979acec1989a5e6864 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 17 Apr 2019 14:23:17 -0400 Subject: [PATCH 16/29] ceph: use pathlen values returned by set_request_path_attr We make copies of the dentry name in set_request_path_attr, but then create_request_message re-fetches the lengths out of the dentry. While we don't currently set the *_drop fields unless the parents are locked, it's still better not to rely on that sort of implicit assumption. Use the pathlen values that set_request_path_attr returned instead, as they will always be correct for the returned paths themselves. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d29f6c094f7c..bfa1733c6336 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2288,9 +2288,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); if (req->r_dentry_drop) - len += req->r_dentry->d_name.len; + len += pathlen1; if (req->r_old_dentry_drop) - len += req->r_old_dentry->d_name.len; + len += pathlen2; msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { From 111c708104506d53bb1845c782cfd98157471e32 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 2 Apr 2019 09:43:18 -0400 Subject: [PATCH 17/29] ceph: after an MDS request, do callback and completions No MDS requests use r_callback today, but that will change in the future. The OSD client always does r_callback and then completes r_completion. Let's have the MDS client do the same. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index bfa1733c6336..b451ec761290 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2384,8 +2384,7 @@ static void complete_request(struct ceph_mds_client *mdsc, { if (req->r_callback) req->r_callback(mdsc, req); - else - complete_all(&req->r_completion); + complete_all(&req->r_completion); } /* From 86bda539fa90184ca404afb38cd015416bf81d15 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 2 Apr 2019 09:24:36 -0400 Subject: [PATCH 18/29] ceph: have ceph_mdsc_do_request call ceph_mdsc_submit_request Nothing calls ceph_mdsc_submit_request today, but in later patches we'll need to be able to call this separately. Have the helper return an int so we can check the r_err under the mutex, and have the caller just check the error code from the submit. Also move the acquisition of CEPH_CAP_PIN references into the same function. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 40 +++++++++++++++++++--------------------- fs/ceph/mds_client.h | 5 +++-- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b451ec761290..ffbb98fdc478 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2626,14 +2626,27 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) } } -void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, +int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req) { - dout("submit_request on %p\n", req); + int err; + + /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ + if (req->r_inode) + ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); + if (req->r_parent) + ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + if (req->r_old_dentry_dir) + ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); + + dout("submit_request on %p for inode %p\n", req, dir); mutex_lock(&mdsc->mutex); - __register_request(mdsc, req, NULL); + __register_request(mdsc, req, dir); __do_request(mdsc, req); + err = req->r_err; mutex_unlock(&mdsc->mutex); + return err; } /* @@ -2648,27 +2661,12 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, dout("do_request on %p\n", req); - /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ - if (req->r_inode) - ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); - if (req->r_parent) - ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); - if (req->r_old_dentry_dir) - ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), - CEPH_CAP_PIN); - /* issue */ - mutex_lock(&mdsc->mutex); - __register_request(mdsc, req, dir); - __do_request(mdsc, req); - - if (req->r_err) { - err = req->r_err; + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (err) goto out; - } /* wait */ - mutex_unlock(&mdsc->mutex); dout("do_request waiting\n"); if (!req->r_timeout && req->r_wait_for_completion) { err = req->r_wait_for_completion(mdsc, req); @@ -2709,8 +2707,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, err = req->r_err; } -out: mutex_unlock(&mdsc->mutex); +out: dout("do_request %p done, result %d\n", req, err); return err; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ebcad5afc87b..a83f28bc2387 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -465,8 +465,9 @@ extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct inode *dir); extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); -extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *req); +extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req); extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); From 8340f22ce50c7c2f9b05e0875891dcc44232dce4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 2 Apr 2019 12:34:38 -0400 Subject: [PATCH 19/29] ceph: move wait for mds request into helper function Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ffbb98fdc478..0026ca094e22 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2649,23 +2649,11 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, return err; } -/* - * Synchrously perform an mds request. Take care of all of the - * session setup, forwarding, retry details. - */ -int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, - struct inode *dir, - struct ceph_mds_request *req) +static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) { int err; - dout("do_request on %p\n", req); - - /* issue */ - err = ceph_mdsc_submit_request(mdsc, dir, req); - if (err) - goto out; - /* wait */ dout("do_request waiting\n"); if (!req->r_timeout && req->r_wait_for_completion) { @@ -2708,7 +2696,25 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, } mutex_unlock(&mdsc->mutex); -out: + return err; +} + +/* + * Synchrously perform an mds request. Take care of all of the + * session setup, forwarding, retry details. + */ +int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req) +{ + int err; + + dout("do_request on %p\n", req); + + /* issue */ + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) + err = ceph_mdsc_wait_request(mdsc, req); dout("do_request %p done, result %d\n", req, err); return err; } From a452bc0636728b8c12632ae4b5f4ddf39cbe39c1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 2 Apr 2019 14:20:24 -0400 Subject: [PATCH 20/29] ceph: fix comment over ceph_drop_caps_for_unlink It's not clear what AUTH_RDCACHE means in this context, and we're clearly just dropping LINK caps here. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f976939f771f..90090a56899e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4096,7 +4096,7 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) } /* - * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it + * For a soon-to-be unlinked file, drop the LINK caps. If it * looks like the link count will hit 0, drop any other caps (other * than PIN) we don't specifically want (due to the file still being * open). From 1199d7da2d29dac5e3983ea1078dbd4ab107e33f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 2 Apr 2019 15:58:05 -0400 Subject: [PATCH 21/29] ceph: simplify arguments and return semantics of try_get_cap_refs The return of this function is rather complex. It can return 0 or 1, and in the case of a 1 return, the "err" pointer will be filled out. This necessitates a lot of copying of values. We can achieve the same effect by just returning 0, 1 or a negative error code, and drop the "err" argument from this function. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 76 ++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 46 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 90090a56899e..9e0b464d374f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2525,9 +2525,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, * to (when applicable), and check against max_size here as well. * Note that caller is responsible for ensuring max_size increases are * requested from the MDS. + * + * Returns 0 if caps were not able to be acquired (yet), a 1 if they were, + * or a negative error code. + * + * FIXME: how does a 0 return differ from -EAGAIN? */ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, bool nonblock, int *got, int *err) + loff_t endoff, bool nonblock, int *got) { struct inode *inode = &ci->vfs_inode; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; @@ -2547,8 +2552,7 @@ again: if ((file_wanted & need) != need) { dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", ceph_cap_string(need), ceph_cap_string(file_wanted)); - *err = -EBADF; - ret = 1; + ret = -EBADF; goto out_unlock; } @@ -2569,10 +2573,8 @@ again: if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); - if (endoff > ci->i_requested_max_size) { - *err = -EAGAIN; - ret = 1; - } + if (endoff > ci->i_requested_max_size) + ret = -EAGAIN; goto out_unlock; } /* @@ -2607,8 +2609,7 @@ again: * task isn't in TASK_RUNNING state */ if (nonblock) { - *err = -EAGAIN; - ret = 1; + ret = -EAGAIN; goto out_unlock; } @@ -2637,8 +2638,7 @@ again: if (session_readonly) { dout("get_cap_refs %p needed %s but mds%d readonly\n", inode, ceph_cap_string(need), ci->i_auth_cap->mds); - *err = -EROFS; - ret = 1; + ret = -EROFS; goto out_unlock; } @@ -2647,16 +2647,14 @@ again: if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout("get_cap_refs %p forced umount\n", inode); - *err = -EIO; - ret = 1; + ret = -EIO; goto out_unlock; } mds_wanted = __ceph_caps_mds_wanted(ci, false); if (need & ~(mds_wanted & need)) { dout("get_cap_refs %p caps were dropped" " (session killed?)\n", inode); - *err = -ESTALE; - ret = 1; + ret = -ESTALE; goto out_unlock; } if (!(file_wanted & ~mds_wanted)) @@ -2707,7 +2705,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, bool nonblock, int *got) { - int ret, err = 0; + int ret; BUG_ON(need & ~CEPH_CAP_FILE_RD); BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); @@ -2715,15 +2713,8 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, if (ret < 0) return ret; - ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err); - if (ret) { - if (err == -EAGAIN) { - ret = 0; - } else if (err < 0) { - ret = err; - } - } - return ret; + ret = try_get_cap_refs(ci, need, want, 0, nonblock, got); + return ret == -EAGAIN ? 0 : ret; } /* @@ -2734,7 +2725,7 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int _got, ret, err = 0; + int _got, ret; ret = ceph_pool_perm_check(ci, need); if (ret < 0) @@ -2744,21 +2735,19 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (endoff > 0) check_max_size(&ci->vfs_inode, endoff); - err = 0; _got = 0; ret = try_get_cap_refs(ci, need, want, endoff, - false, &_got, &err); - if (ret) { - if (err == -EAGAIN) - continue; - if (err < 0) - ret = err; - } else { + false, &_got); + if (ret == -EAGAIN) { + continue; + } else if (!ret) { + int err; + DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&ci->i_cap_wq, &wait); - while (!try_get_cap_refs(ci, need, want, endoff, - true, &_got, &err)) { + while (!(err = try_get_cap_refs(ci, need, want, endoff, + true, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -2767,19 +2756,14 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, } remove_wait_queue(&ci->i_cap_wq, &wait); - if (err == -EAGAIN) continue; - if (err < 0) - ret = err; } - if (ret < 0) { - if (err == -ESTALE) { - /* session was killed, try renew caps */ - ret = ceph_renew_caps(&ci->vfs_inode); - if (ret == 0) - continue; - } + if (ret == -ESTALE) { + /* session was killed, try renew caps */ + ret = ceph_renew_caps(&ci->vfs_inode); + if (ret == 0) + continue; return ret; } From 488f5284e29b2dbb7c224d51b56fc20348e73735 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 24 Apr 2019 10:05:39 -0400 Subject: [PATCH 22/29] ceph: just call get_session in __ceph_lookup_mds_session I originally thought there was a potential race here, but the fact that this is called with the mdsc->mutex held, ensures that the last reference to the session can't be put here. Still, it's clearer to just return the value from get_session here, and may prevent a bug later if we ever rework this code to be less reliant on mutexes. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0026ca094e22..5f3f375cf0e9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -550,15 +550,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s) struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, int mds) { - struct ceph_mds_session *session; - if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return NULL; - session = mdsc->sessions[mds]; - dout("lookup_mds_session %p %d\n", session, - refcount_read(&session->s_ref)); - get_session(session); - return session; + return get_session(mdsc->sessions[mds]); } static bool __have_session(struct ceph_mds_client *mdsc, int mds) From 5ddc61fc145861e4e542c9273a4088b627ba9b8d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 23 Apr 2019 13:40:02 -0400 Subject: [PATCH 23/29] ceph: print inode number in __caps_issued_mask debugging messages To make it easier to correlate with MDS logs. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9e0b464d374f..72f8e1311392 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -892,8 +892,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) int have = ci->i_snap_caps; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask %p snap issued %s" - " (mask %s)\n", &ci->vfs_inode, + dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s" + " (mask %s)\n", ci->vfs_inode.i_ino, ceph_cap_string(have), ceph_cap_string(mask)); return 1; @@ -904,8 +904,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if (!__cap_is_valid(cap)) continue; if ((cap->issued & mask) == mask) { - dout("__ceph_caps_issued_mask %p cap %p issued %s" - " (mask %s)\n", &ci->vfs_inode, cap, + dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s" + " (mask %s)\n", ci->vfs_inode.i_ino, cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) @@ -916,8 +916,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* does a combination of caps satisfy mask? */ have |= cap->issued; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask %p combo issued %s" - " (mask %s)\n", &ci->vfs_inode, + dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s" + " (mask %s)\n", ci->vfs_inode.i_ino, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { From a32e414325c2f0d430436e4708a33c756b082fd8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 2 May 2019 15:56:00 +0200 Subject: [PATCH 24/29] rbd: client_mutex is never nested Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 612e4aecfa7f..99de7166bf89 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -934,7 +934,7 @@ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) struct rbd_client *rbdc; int ret; - mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&client_mutex); rbdc = rbd_client_find(ceph_opts); if (rbdc) { ceph_destroy_options(ceph_opts); From b91a7bdca4439e286f26cdd6c15ed338e6a9fda2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 3 May 2019 17:27:03 +0200 Subject: [PATCH 25/29] rbd: don't assert on writes to snapshots The check added in commit 721c7fc701c7 ("block: fail op_is_write() requests to read-only partitions") was lifted in commit a32e236eb93e ("Partially revert "block: fail op_is_write() requests to read-only partitions""). Basic things like user triggered writes and discards are still caught, but internal kernel users can submit anything. In particular, ext4 will attempt to write to the superblock if it detects errors in the filesystem, even if the filesystem is mounted read-only on a read-only partition. The assert is overkill regardless. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 99de7166bf89..e5009a34f9c2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3842,8 +3842,12 @@ static void rbd_queue_workfn(struct work_struct *work) goto err_rq; } - rbd_assert(op_type == OBJ_OP_READ || - rbd_dev->spec->snap_id == CEPH_NOSNAP); + if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) { + rbd_warn(rbd_dev, "%s on read-only snapshot", + obj_op_name(op_type)); + result = -EIO; + goto err; + } /* * Quit early if the mapped snapshot no longer exists. It's From cede185b1ba3118e1912385db4812a37d9e9b205 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 6 May 2019 09:38:46 -0400 Subject: [PATCH 26/29] libceph: fix unaligned accesses in ceph_entity_addr handling GCC9 is throwing a lot of warnings about unaligned access. This patch fixes some of them by changing most of the sockaddr handling functions to take a pointer to struct ceph_entity_addr instead of struct sockaddr_storage. The lower functions can then make copies or do unaligned accesses as needed. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 77 +++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3083988ce729..3e0cc9808ae4 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -449,7 +449,7 @@ static void set_sock_callbacks(struct socket *sock, */ static int ceph_tcp_connect(struct ceph_connection *con) { - struct sockaddr_storage *paddr = &con->peer_addr.in_addr; + struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ struct socket *sock; unsigned int noio_flag; int ret; @@ -458,7 +458,7 @@ static int ceph_tcp_connect(struct ceph_connection *con) /* sock_create_kern() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); - ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family, + ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); memalloc_noio_restore(noio_flag); if (ret) @@ -474,7 +474,7 @@ static int ceph_tcp_connect(struct ceph_connection *con) dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); con_sock_state_connecting(con); - ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), + ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", @@ -1795,14 +1795,15 @@ static int verify_hello(struct ceph_connection *con) return 0; } -static bool addr_is_blank(struct sockaddr_storage *ss) +static bool addr_is_blank(struct ceph_entity_addr *addr) { - struct in_addr *addr = &((struct sockaddr_in *)ss)->sin_addr; - struct in6_addr *addr6 = &((struct sockaddr_in6 *)ss)->sin6_addr; + struct sockaddr_storage ss = addr->in_addr; /* align */ + struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; + struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr; - switch (ss->ss_family) { + switch (ss.ss_family) { case AF_INET: - return addr->s_addr == htonl(INADDR_ANY); + return addr4->s_addr == htonl(INADDR_ANY); case AF_INET6: return ipv6_addr_any(addr6); default: @@ -1810,25 +1811,25 @@ static bool addr_is_blank(struct sockaddr_storage *ss) } } -static int addr_port(struct sockaddr_storage *ss) +static int addr_port(struct ceph_entity_addr *addr) { - switch (ss->ss_family) { + switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: - return ntohs(((struct sockaddr_in *)ss)->sin_port); + return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port)); case AF_INET6: - return ntohs(((struct sockaddr_in6 *)ss)->sin6_port); + return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port)); } return 0; } -static void addr_set_port(struct sockaddr_storage *ss, int p) +static void addr_set_port(struct ceph_entity_addr *addr, int p) { - switch (ss->ss_family) { + switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: - ((struct sockaddr_in *)ss)->sin_port = htons(p); + put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port); break; case AF_INET6: - ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); + put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port); break; } } @@ -1836,21 +1837,18 @@ static void addr_set_port(struct sockaddr_storage *ss, int p) /* * Unlike other *_pton function semantics, zero indicates success. */ -static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, +static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr, char delim, const char **ipend) { - struct sockaddr_in *in4 = (struct sockaddr_in *) ss; - struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; + memset(&addr->in_addr, 0, sizeof(addr->in_addr)); - memset(ss, 0, sizeof(*ss)); - - if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) { - ss->ss_family = AF_INET; + if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) { + put_unaligned(AF_INET, &addr->in_addr.ss_family); return 0; } - if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) { - ss->ss_family = AF_INET6; + if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) { + put_unaligned(AF_INET6, &addr->in_addr.ss_family); return 0; } @@ -1862,7 +1860,7 @@ static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, */ #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER static int ceph_dns_resolve_name(const char *name, size_t namelen, - struct sockaddr_storage *ss, char delim, const char **ipend) + struct ceph_entity_addr *addr, char delim, const char **ipend) { const char *end, *delim_p; char *colon_p, *ip_addr = NULL; @@ -1891,7 +1889,7 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen, /* do dns_resolve upcall */ ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL); if (ip_len > 0) - ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL); + ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL); else ret = -ESRCH; @@ -1900,13 +1898,13 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen, *ipend = end; pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, - ret, ret ? "failed" : ceph_pr_addr(ss)); + ret, ret ? "failed" : ceph_pr_addr(&addr->in_addr)); return ret; } #else static inline int ceph_dns_resolve_name(const char *name, size_t namelen, - struct sockaddr_storage *ss, char delim, const char **ipend) + struct ceph_entity_addr *addr, char delim, const char **ipend) { return -EINVAL; } @@ -1917,13 +1915,13 @@ static inline int ceph_dns_resolve_name(const char *name, size_t namelen, * then try to extract a hostname to resolve using userspace DNS upcall. */ static int ceph_parse_server_name(const char *name, size_t namelen, - struct sockaddr_storage *ss, char delim, const char **ipend) + struct ceph_entity_addr *addr, char delim, const char **ipend) { int ret; - ret = ceph_pton(name, namelen, ss, delim, ipend); + ret = ceph_pton(name, namelen, addr, delim, ipend); if (ret) - ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend); + ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend); return ret; } @@ -1942,7 +1940,6 @@ int ceph_parse_ips(const char *c, const char *end, dout("parse_ips on '%.*s'\n", (int)(end-c), c); for (i = 0; i < max_count; i++) { const char *ipend; - struct sockaddr_storage *ss = &addr[i].in_addr; int port; char delim = ','; @@ -1951,7 +1948,7 @@ int ceph_parse_ips(const char *c, const char *end, p++; } - ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend); + ret = ceph_parse_server_name(p, end - p, &addr[i], delim, &ipend); if (ret) goto bad; ret = -EINVAL; @@ -1982,9 +1979,9 @@ int ceph_parse_ips(const char *c, const char *end, port = CEPH_MON_PORT; } - addr_set_port(ss, port); + addr_set_port(&addr[i], port); - dout("parse_ips got %s\n", ceph_pr_addr(ss)); + dout("parse_ips got %s\n", ceph_pr_addr(&addr[i].in_addr)); if (p == end) break; @@ -2023,7 +2020,7 @@ static int process_banner(struct ceph_connection *con) */ if (memcmp(&con->peer_addr, &con->actual_peer_addr, sizeof(con->peer_addr)) != 0 && - !(addr_is_blank(&con->actual_peer_addr.in_addr) && + !(addr_is_blank(&con->actual_peer_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { pr_warn("wrong peer, want %s/%d, got %s/%d\n", ceph_pr_addr(&con->peer_addr.in_addr), @@ -2037,13 +2034,13 @@ static int process_banner(struct ceph_connection *con) /* * did we learn our address? */ - if (addr_is_blank(&con->msgr->inst.addr.in_addr)) { - int port = addr_port(&con->msgr->inst.addr.in_addr); + if (addr_is_blank(&con->msgr->inst.addr)) { + int port = addr_port(&con->msgr->inst.addr); memcpy(&con->msgr->inst.addr.in_addr, &con->peer_addr_for_me.in_addr, sizeof(con->peer_addr_for_me.in_addr)); - addr_set_port(&con->msgr->inst.addr.in_addr, port); + addr_set_port(&con->msgr->inst.addr, port); encode_my_addr(con->msgr); dout("process_banner learned my addr is %s\n", ceph_pr_addr(&con->msgr->inst.addr.in_addr)); From b726ec972cf2122137fbc47847b4fcc7b3bc2801 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 6 May 2019 09:38:47 -0400 Subject: [PATCH 27/29] libceph: make ceph_pr_addr take an struct ceph_entity_addr pointer GCC9 is throwing a lot of warnings about unaligned accesses by callers of ceph_pr_addr. All of the current callers are passing a pointer to the sockaddr inside struct ceph_entity_addr. Fix it to take a pointer to a struct ceph_entity_addr instead, and then have the function make a copy of the sockaddr before printing it. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 2 +- fs/ceph/mdsmap.c | 2 +- include/linux/ceph/messenger.h | 3 ++- net/ceph/cls_lock_client.c | 2 +- net/ceph/debugfs.c | 4 +-- net/ceph/messenger.c | 48 +++++++++++++++++----------------- net/ceph/mon_client.c | 6 ++--- net/ceph/osd_client.c | 2 +- 8 files changed, 35 insertions(+), 34 deletions(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index b014fc7d4e3c..b3fc5fe26a1a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -37,7 +37,7 @@ static int mdsmap_show(struct seq_file *s, void *p) struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; int state = mdsmap->m_info[i].state; seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, - ceph_pr_addr(&addr->in_addr), + ceph_pr_addr(addr), ceph_mds_state_name(state)); } return 0; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 1a2c5d390f7f..701b4fb0fb5a 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -205,7 +205,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", i+1, n, global_id, mds, inc, - ceph_pr_addr(&addr.in_addr), + ceph_pr_addr(&addr), ceph_mds_state_name(state)); if (mds < 0 || state <= 0) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 800a2128d411..23895d178149 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -323,7 +323,8 @@ struct ceph_connection { }; -extern const char *ceph_pr_addr(const struct sockaddr_storage *ss); +extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); + extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count); diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index 2105a6eaa66c..4cc28541281b 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -271,7 +271,7 @@ static int decode_locker(void **p, void *end, struct ceph_locker *locker) dout("%s %s%llu cookie %s addr %s\n", __func__, ENTITY_NAME(locker->id.name), locker->id.cookie, - ceph_pr_addr(&locker->info.addr.in_addr)); + ceph_pr_addr(&locker->info.addr)); return 0; } diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 46f65709a6ff..63aef9915f75 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -46,7 +46,7 @@ static int monmap_show(struct seq_file *s, void *p) seq_printf(s, "\t%s%lld\t%s\n", ENTITY_NAME(inst->name), - ceph_pr_addr(&inst->addr.in_addr)); + ceph_pr_addr(&inst->addr)); } return 0; } @@ -82,7 +82,7 @@ static int osdmap_show(struct seq_file *s, void *p) char sb[64]; seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", - i, ceph_pr_addr(&addr->in_addr), + i, ceph_pr_addr(addr), ((map->osd_weight[i]*100) >> 16), ceph_osdmap_state_str(sb, sizeof(sb), state), ((ceph_get_primary_affinity(map, i)*100) >> 16)); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3e0cc9808ae4..3ee380758ddd 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -186,17 +186,18 @@ static atomic_t addr_str_seq = ATOMIC_INIT(0); static struct page *zero_page; /* used in certain error cases */ -const char *ceph_pr_addr(const struct sockaddr_storage *ss) +const char *ceph_pr_addr(const struct ceph_entity_addr *addr) { int i; char *s; - struct sockaddr_in *in4 = (struct sockaddr_in *) ss; - struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; + struct sockaddr_storage ss = addr->in_addr; /* align */ + struct sockaddr_in *in4 = (struct sockaddr_in *)&ss; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss; i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; s = addr_str[i]; - switch (ss->ss_family) { + switch (ss.ss_family) { case AF_INET: snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr, ntohs(in4->sin_port)); @@ -209,7 +210,7 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) default: snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", - ss->ss_family); + ss.ss_family); } return s; @@ -471,18 +472,18 @@ static int ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); - dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); + dout("connect %s\n", ceph_pr_addr(&con->peer_addr)); con_sock_state_connecting(con); ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), sock->sk->sk_state); } else if (ret < 0) { pr_err("connect %s error %d\n", - ceph_pr_addr(&con->peer_addr.in_addr), ret); + ceph_pr_addr(&con->peer_addr), ret); sock_release(sock); return ret; } @@ -669,8 +670,7 @@ static void reset_connection(struct ceph_connection *con) void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); - dout("con_close %p peer %s\n", con, - ceph_pr_addr(&con->peer_addr.in_addr)); + dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); con->state = CON_STATE_CLOSED; con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ @@ -694,7 +694,7 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) { mutex_lock(&con->mutex); - dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); + dout("con_open %p %s\n", con, ceph_pr_addr(addr)); WARN_ON(con->state != CON_STATE_CLOSED); con->state = CON_STATE_PREOPEN; @@ -1788,7 +1788,7 @@ static int verify_hello(struct ceph_connection *con) { if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { pr_err("connect to %s got bad banner\n", - ceph_pr_addr(&con->peer_addr.in_addr)); + ceph_pr_addr(&con->peer_addr)); con->error_msg = "protocol error, bad banner"; return -1; } @@ -1898,7 +1898,7 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen, *ipend = end; pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, - ret, ret ? "failed" : ceph_pr_addr(&addr->in_addr)); + ret, ret ? "failed" : ceph_pr_addr(addr)); return ret; } @@ -1981,7 +1981,7 @@ int ceph_parse_ips(const char *c, const char *end, addr_set_port(&addr[i], port); - dout("parse_ips got %s\n", ceph_pr_addr(&addr[i].in_addr)); + dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); if (p == end) break; @@ -2023,9 +2023,9 @@ static int process_banner(struct ceph_connection *con) !(addr_is_blank(&con->actual_peer_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { pr_warn("wrong peer, want %s/%d, got %s/%d\n", - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), (int)le32_to_cpu(con->peer_addr.nonce), - ceph_pr_addr(&con->actual_peer_addr.in_addr), + ceph_pr_addr(&con->actual_peer_addr), (int)le32_to_cpu(con->actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; @@ -2043,7 +2043,7 @@ static int process_banner(struct ceph_connection *con) addr_set_port(&con->msgr->inst.addr, port); encode_my_addr(con->msgr); dout("process_banner learned my addr is %s\n", - ceph_pr_addr(&con->msgr->inst.addr.in_addr)); + ceph_pr_addr(&con->msgr->inst.addr)); } return 0; @@ -2094,7 +2094,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s feature set mismatch," " my %llx < server's %llx, missing %llx\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), sup_feat, server_feat, server_feat & ~sup_feat); con->error_msg = "missing required protocol features"; reset_connection(con); @@ -2104,7 +2104,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s protocol version mismatch," " my %d != server's %d\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->out_connect.protocol_version), le32_to_cpu(con->in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; @@ -2138,7 +2138,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.connect_seq)); pr_err("%s%lld %s connection reset\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr)); + ceph_pr_addr(&con->peer_addr)); reset_connection(con); con_out_kvec_reset(con); ret = prepare_write_connect(con); @@ -2195,7 +2195,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s protocol feature mismatch," " my required %llx > server's %llx, need %llx\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), req_feat, server_feat, req_feat & ~server_feat); con->error_msg = "missing required protocol features"; reset_connection(con); @@ -2402,7 +2402,7 @@ static int read_partial_message(struct ceph_connection *con) if ((s64)seq - (s64)con->in_seq < 1) { pr_info("skipping %s%lld %s seq %lld expected %lld\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), seq, con->in_seq + 1); con->in_base_pos = -front_len - middle_len - data_len - sizeof_footer(con); @@ -2981,10 +2981,10 @@ static void ceph_con_workfn(struct work_struct *work) static void con_fault(struct ceph_connection *con) { dout("fault %p state %lu to peer %s\n", - con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); + con, con->state, ceph_pr_addr(&con->peer_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); + ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; WARN_ON(con->state != CON_STATE_CONNECTING && diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index a53e4fbb6319..895679d3529b 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -76,7 +76,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end) m->num_mon); for (i = 0; i < m->num_mon; i++) dout("monmap_decode mon%d is %s\n", i, - ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); + ceph_pr_addr(&m->mon_inst[i].addr)); return m; bad: @@ -203,7 +203,7 @@ static void reopen_session(struct ceph_mon_client *monc) { if (!monc->hunting) pr_info("mon%d %s session lost, hunting for new mon\n", - monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr)); + monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr)); __close_session(monc); __open_session(monc); @@ -1178,7 +1178,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, __resend_generic_request(monc); pr_info("mon%d %s session established\n", monc->cur_mon, - ceph_pr_addr(&monc->con.peer_addr.in_addr)); + ceph_pr_addr(&monc->con.peer_addr)); } out: diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index fa9530dd876e..e6d31e0f0289 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -4926,7 +4926,7 @@ static int decode_watcher(void **p, void *end, struct ceph_watch_item *item) dout("%s %s%llu cookie %llu addr %s\n", __func__, ENTITY_NAME(item->name), item->cookie, - ceph_pr_addr(&item->addr.in_addr)); + ceph_pr_addr(&item->addr)); return 0; } From 4198aba4f431e2d2c24a11cdb9d53106fda00b43 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 2 May 2019 08:06:50 -0400 Subject: [PATCH 28/29] ceph: fix unaligned access in ceph_send_cap_releases Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5f3f375cf0e9..959b1bf7c327 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1855,7 +1855,8 @@ again: num_cap_releases--; head = msg->front.iov_base; - le32_add_cpu(&head->num, 1); + put_unaligned_le32(get_unaligned_le32(&head->num) + 1, + &head->num); item = msg->front.iov_base + msg->front.iov_len; item->ino = cpu_to_le64(cap->cap_ino); item->cap_id = cpu_to_le64(cap->cap_id); From 00abf69dd24f4444d185982379c5cc3bb7b6d1fc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 7 May 2019 09:20:54 -0400 Subject: [PATCH 29/29] ceph: flush dirty inodes before proceeding with remount xfstest generic/452 was triggering a "Busy inodes after umount" warning. ceph was allowing the mount to go read-only without first flushing out dirty inodes in the cache. Ensure we sync out the filesystem before allowing a remount to proceed. Cc: stable@vger.kernel.org Link: http://tracker.ceph.com/issues/39571 Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6d5bb2f74612..01113c86e469 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -845,6 +845,12 @@ static void ceph_umount_begin(struct super_block *sb) return; } +static int ceph_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + return 0; +} + static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, @@ -852,6 +858,7 @@ static const struct super_operations ceph_super_ops = { .drop_inode = ceph_drop_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, + .remount_fs = ceph_remount, .show_options = ceph_show_options, .statfs = ceph_statfs, .umount_begin = ceph_umount_begin,