Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull vfs updates from Al Viro:
 "Stuff in here:

   - acct.c fixes and general rework of mnt_pin mechanism.  That allows
     to go for delayed-mntput stuff, which will permit mntput() on deep
     stack without worrying about stack overflows - fs shutdown will
     happen on shallow stack.  IOW, we can do Eric's umount-on-rmdir
     series without introducing tons of stack overflows on new mntput()
     call chains it introduces.
   - Bruce's d_splice_alias() patches
   - more Miklos' rename() stuff.
   - a couple of regression fixes (stable fodder, in the end of branch)
     and a fix for API idiocy in iov_iter.c.

  There definitely will be another pile, maybe even two.  I'd like to
  get Eric's series in this time, but even if we miss it, it'll go right
  in the beginning of for-next in the next cycle - the tricky part of
  prereqs is in this pile"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (40 commits)
  fix copy_tree() regression
  __generic_file_write_iter(): fix handling of sync error after DIO
  switch iov_iter_get_pages() to passing maximal number of pages
  fs: mark __d_obtain_alias static
  dcache: d_splice_alias should detect loops
  exportfs: update Exporting documentation
  dcache: d_find_alias needn't recheck IS_ROOT && DCACHE_DISCONNECTED
  dcache: remove unused d_find_alias parameter
  dcache: d_obtain_alias callers don't all want DISCONNECTED
  dcache: d_splice_alias should ignore DCACHE_DISCONNECTED
  dcache: d_splice_alias mustn't create directory aliases
  dcache: close d_move race in d_splice_alias
  dcache: move d_splice_alias
  namei: trivial fix to vfs_rename_dir comment
  VFS: allow ->d_manage() to declare -EISDIR in rcu_walk mode.
  cifs: support RENAME_NOREPLACE
  hostfs: support rename flags
  shmem: support RENAME_EXCHANGE
  shmem: support RENAME_NOREPLACE
  btrfs: add RENAME_NOREPLACE
  ...
This commit is contained in:
Linus Torvalds 2014-08-11 11:44:11 -07:00
commit f6f993328b
36 changed files with 641 additions and 461 deletions

View File

@ -66,23 +66,31 @@ b/ A per-superblock list "s_anon" of dentries which are the roots of
c/ Helper routines to allocate anonymous dentries, and to help attach
loose directory dentries at lookup time. They are:
d_alloc_anon(inode) will return a dentry for the given inode.
d_obtain_alias(inode) will return a dentry for the given inode.
If the inode already has a dentry, one of those is returned.
If it doesn't, a new anonymous (IS_ROOT and
DCACHE_DISCONNECTED) dentry is allocated and attached.
In the case of a directory, care is taken that only one dentry
can ever be attached.
d_splice_alias(inode, dentry) will make sure that there is a
dentry with the same name and parent as the given dentry, and
which refers to the given inode.
If the inode is a directory and already has a dentry, then that
dentry is d_moved over the given dentry.
If the passed dentry gets attached, care is taken that this is
mutually exclusive to a d_alloc_anon operation.
If the passed dentry is used, NULL is returned, else the used
dentry is returned. This corresponds to the calling pattern of
->lookup.
d_splice_alias(inode, dentry) or d_materialise_unique(dentry, inode)
will introduce a new dentry into the tree; either the passed-in
dentry or a preexisting alias for the given inode (such as an
anonymous one created by d_obtain_alias), if appropriate. The two
functions differ in their handling of directories with preexisting
aliases:
d_splice_alias will use any existing IS_ROOT dentry, but it will
return -EIO rather than try to move a dentry with a different
parent. This is appropriate for local filesystems, which
should never see such an alias unless the filesystem is
corrupted somehow (for example, if two on-disk directory
entries refer to the same directory.)
d_materialise_unique will attempt to move any dentry. This is
appropriate for distributed filesystems, where finding a
directory other than where we last cached it may be a normal
consequence of concurrent operations on other hosts.
Both functions return NULL when the passed-in dentry is used,
following the calling convention of ->lookup.
Filesystem Issues
-----------------
@ -120,12 +128,12 @@ struct which has the following members:
fh_to_dentry (mandatory)
Given a filehandle fragment, this should find the implied object and
create a dentry for it (possibly with d_alloc_anon).
create a dentry for it (possibly with d_obtain_alias).
fh_to_parent (optional but strongly recommended)
Given a filehandle fragment, this should find the parent of the
implied object and create a dentry for it (possibly with d_alloc_anon).
May fail if the filehandle fragment is too small.
implied object and create a dentry for it (possibly with
d_obtain_alias). May fail if the filehandle fragment is too small.
get_parent (optional but strongly recommended)
When given a dentry for a directory, this should return a dentry for

View File

@ -1053,7 +1053,8 @@ struct dentry_operations {
If the 'rcu_walk' parameter is true, then the caller is doing a
pathwalk in RCU-walk mode. Sleeping is not permitted in this mode,
and the caller can be asked to leave it and call again by returning
-ECHILD.
-ECHILD. -EISDIR may also be returned to tell pathwalk to
ignore d_automount or any mounts.
This function is only used if DCACHE_MANAGE_TRANSIT is set on the
dentry being transited from.

View File

@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o \
stack.o fs_struct.o statfs.o
stack.o fs_struct.o statfs.o fs_pin.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o

View File

@ -218,8 +218,9 @@ static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
return -EIO;
}
static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
/*
 * ->rename2 handler installed in bad_inode_ops: every argument, including
 * @flags, is ignored and the rename always fails with -EIO.
 */
static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
return -EIO;
}
@ -279,7 +280,7 @@ static const struct inode_operations bad_inode_ops =
.mkdir = bad_inode_mkdir,
.rmdir = bad_inode_rmdir,
.mknod = bad_inode_mknod,
.rename = bad_inode_rename,
.rename2 = bad_inode_rename2,
.readlink = bad_inode_readlink,
/* follow_link must be no-op, otherwise unmounting this inode
won't work */

View File

@ -8476,6 +8476,16 @@ out_notrans:
return ret;
}
/*
 * ->rename2 entry point for btrfs directories.
 *
 * RENAME_NOREPLACE is the only flag accepted; any other bit in @flags is
 * rejected with -EINVAL.  The flags are not forwarded: the actual work is
 * done by btrfs_rename(), whose return value is passed straight back.
 */
static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry,
			 unsigned int flags)
{
	const unsigned int supported = RENAME_NOREPLACE;

	if ((flags & ~supported) != 0)
		return -EINVAL;

	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
}
static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
struct btrfs_delalloc_work *delalloc_work;
@ -9019,7 +9029,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.link = btrfs_link,
.mkdir = btrfs_mkdir,
.rmdir = btrfs_rmdir,
.rename = btrfs_rename,
.rename2 = btrfs_rename2,
.symlink = btrfs_symlink,
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,

View File

@ -851,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb,
struct btrfs_path *path;
struct btrfs_key location;
struct inode *inode;
struct dentry *dentry;
u64 dir_id;
int new = 0;
@ -922,13 +921,7 @@ setup_root:
return dget(sb->s_root);
}
dentry = d_obtain_alias(inode);
if (!IS_ERR(dentry)) {
spin_lock(&dentry->d_lock);
dentry->d_flags &= ~DCACHE_DISCONNECTED;
spin_unlock(&dentry->d_lock);
}
return dentry;
return d_obtain_root(inode);
}
static int btrfs_fill_super(struct super_block *sb,

View File

@ -755,7 +755,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
goto out;
}
} else {
root = d_obtain_alias(inode);
root = d_obtain_root(inode);
}
ceph_init_dentry(root);
dout("open_root_inode success, root dentry is %p\n", root);

View File

@ -848,7 +848,7 @@ const struct inode_operations cifs_dir_inode_ops = {
.link = cifs_hardlink,
.mkdir = cifs_mkdir,
.rmdir = cifs_rmdir,
.rename = cifs_rename,
.rename2 = cifs_rename2,
.permission = cifs_permission,
/* revalidate:cifs_revalidate, */
.setattr = cifs_setattr,

View File

@ -68,8 +68,8 @@ extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
extern int cifs_rmdir(struct inode *, struct dentry *);
extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
struct dentry *);
extern int cifs_rename2(struct inode *, struct dentry *, struct inode *,
struct dentry *, unsigned int);
extern int cifs_revalidate_file_attr(struct file *filp);
extern int cifs_revalidate_dentry_attr(struct dentry *);
extern int cifs_revalidate_file(struct file *filp);

View File

@ -1627,8 +1627,9 @@ do_rename_exit:
}
int
cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
struct inode *target_dir, struct dentry *target_dentry)
cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
struct inode *target_dir, struct dentry *target_dentry,
unsigned int flags)
{
char *from_name = NULL;
char *to_name = NULL;
@ -1640,6 +1641,9 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
unsigned int xid;
int rc, tmprc;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
cifs_sb = CIFS_SB(source_dir->i_sb);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@ -1667,6 +1671,12 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
to_name);
/*
* No-replace is the natural behavior for CIFS, so skip unlink hacks.
*/
if (flags & RENAME_NOREPLACE)
goto cifs_rename_exit;
if (rc == -EEXIST && tcon->unix_ext) {
/*
* Are src and dst hardlinks of same inode? We can only tell

View File

@ -731,8 +731,6 @@ EXPORT_SYMBOL(dget_parent);
/**
* d_find_alias - grab a hashed alias of inode
* @inode: inode in question
* @want_discon: flag, used by d_splice_alias, to request
* that only a DISCONNECTED alias be returned.
*
* If inode has a hashed alias, or is a directory and has any alias,
* acquire the reference to alias and return it. Otherwise return NULL.
@ -741,10 +739,9 @@ EXPORT_SYMBOL(dget_parent);
* of a filesystem.
*
* If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
* any other hashed alias over that one unless @want_discon is set,
* in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
* any other hashed alias over that one.
*/
static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
static struct dentry *__d_find_alias(struct inode *inode)
{
struct dentry *alias, *discon_alias;
@ -756,7 +753,7 @@ again:
if (IS_ROOT(alias) &&
(alias->d_flags & DCACHE_DISCONNECTED)) {
discon_alias = alias;
} else if (!want_discon) {
} else {
__dget_dlock(alias);
spin_unlock(&alias->d_lock);
return alias;
@ -768,12 +765,9 @@ again:
alias = discon_alias;
spin_lock(&alias->d_lock);
if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
if (IS_ROOT(alias) &&
(alias->d_flags & DCACHE_DISCONNECTED)) {
__dget_dlock(alias);
spin_unlock(&alias->d_lock);
return alias;
}
__dget_dlock(alias);
spin_unlock(&alias->d_lock);
return alias;
}
spin_unlock(&alias->d_lock);
goto again;
@ -787,7 +781,7 @@ struct dentry *d_find_alias(struct inode *inode)
if (!hlist_empty(&inode->i_dentry)) {
spin_lock(&inode->i_lock);
de = __d_find_alias(inode, 0);
de = __d_find_alias(inode);
spin_unlock(&inode->i_lock);
}
return de;
@ -1781,25 +1775,7 @@ struct dentry *d_find_any_alias(struct inode *inode)
}
EXPORT_SYMBOL(d_find_any_alias);
/**
* d_obtain_alias - find or allocate a dentry for a given inode
* @inode: inode to allocate the dentry for
*
* Obtain a dentry for an inode resulting from NFS filehandle conversion or
* similar open by handle operations. The returned dentry may be anonymous,
* or may have a full name (if the inode was already in the cache).
*
* When called on a directory inode, we must ensure that the inode only ever
* has one dentry. If a dentry is found, that is returned instead of
* allocating a new one.
*
* On successful return, the reference to the inode has been transferred
* to the dentry. In case of an error the reference on the inode is released.
* To make it easier to use in export operations a %NULL or IS_ERR inode may
* be passed in and will be the error will be propagate to the return value,
* with a %NULL @inode replaced by ERR_PTR(-ESTALE).
*/
struct dentry *d_obtain_alias(struct inode *inode)
static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
{
static const struct qstr anonstring = QSTR_INIT("/", 1);
struct dentry *tmp;
@ -1830,7 +1806,10 @@ struct dentry *d_obtain_alias(struct inode *inode)
}
/* attach a disconnected dentry */
add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED;
add_flags = d_flags_for_inode(inode);
if (disconnected)
add_flags |= DCACHE_DISCONNECTED;
spin_lock(&tmp->d_lock);
tmp->d_inode = inode;
@ -1851,59 +1830,51 @@ struct dentry *d_obtain_alias(struct inode *inode)
iput(inode);
return res;
}
/**
* d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
* @inode: inode to allocate the dentry for
*
* Obtain a dentry for an inode resulting from NFS filehandle conversion or
* similar open by handle operations. The returned dentry may be anonymous,
* or may have a full name (if the inode was already in the cache).
*
* When called on a directory inode, we must ensure that the inode only ever
* has one dentry. If a dentry is found, that is returned instead of
* allocating a new one.
*
* On successful return, the reference to the inode has been transferred
* to the dentry. In case of an error the reference on the inode is released.
* To make it easier to use in export operations a %NULL or IS_ERR inode may
* be passed in and the error will be propagated to the return value,
* with a %NULL @inode replaced by ERR_PTR(-ESTALE).
*/
struct dentry *d_obtain_alias(struct inode *inode)
{
/* disconnected=1: a newly attached dentry also gets DCACHE_DISCONNECTED */
return __d_obtain_alias(inode, 1);
}
EXPORT_SYMBOL(d_obtain_alias);
/**
* d_splice_alias - splice a disconnected dentry into the tree if one exists
* @inode: the inode which may have a disconnected dentry
* @dentry: a negative dentry which we want to point to the inode.
* d_obtain_root - find or allocate a dentry for a given inode
* @inode: inode to allocate the dentry for
*
* If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
* DCACHE_DISCONNECTED), then d_move that in place of the given dentry
* and return it, else simply d_add the inode to the dentry and return NULL.
* Obtain an IS_ROOT dentry for the root of a filesystem.
*
* This is needed in the lookup routine of any filesystem that is exportable
* (via knfsd) so that we can build dcache paths to directories effectively.
* We must ensure that directory inodes only ever have one dentry. If a
* dentry is found, that is returned instead of allocating a new one.
*
* If a dentry was found and moved, then it is returned. Otherwise NULL
* is returned. This matches the expected return value of ->lookup.
*
* Cluster filesystems may call this function with a negative, hashed dentry.
* In that case, we know that the inode will be a regular file, and also this
* will only occur during atomic_open. So we need to check for the dentry
* being already hashed only in the final case.
* On successful return, the reference to the inode has been transferred
* to the dentry. In case of an error the reference on the inode is
* released. A %NULL or IS_ERR inode may be passed in and the
* error will be propagated to the return value, with a %NULL @inode
* replaced by ERR_PTR(-ESTALE).
*/
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
struct dentry *d_obtain_root(struct inode *inode)
{
struct dentry *new = NULL;
if (IS_ERR(inode))
return ERR_CAST(inode);
if (inode && S_ISDIR(inode->i_mode)) {
spin_lock(&inode->i_lock);
new = __d_find_alias(inode, 1);
if (new) {
BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
spin_unlock(&inode->i_lock);
security_d_instantiate(new, inode);
d_move(new, dentry);
iput(inode);
} else {
/* already taking inode->i_lock, so d_add() by hand */
__d_instantiate(dentry, inode);
spin_unlock(&inode->i_lock);
security_d_instantiate(dentry, inode);
d_rehash(dentry);
}
} else {
d_instantiate(dentry, inode);
if (d_unhashed(dentry))
d_rehash(dentry);
}
return new;
return __d_obtain_alias(inode, 0);
}
EXPORT_SYMBOL(d_splice_alias);
EXPORT_SYMBOL(d_obtain_root);
/**
* d_add_ci - lookup or allocate new dentry with case-exact name
@ -2696,6 +2667,75 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
/* anon->d_lock still locked, returns locked */
}
/**
* d_splice_alias - splice a disconnected dentry into the tree if one exists
* @inode: the inode which may have a disconnected dentry
* @dentry: a negative dentry which we want to point to the inode.
*
* If inode is a directory and has an IS_ROOT alias, then d_move that in
* place of the given dentry and return it, else simply d_add the inode
* to the dentry and return NULL.
*
* If a non-IS_ROOT directory is found, the filesystem is corrupt, and
* we should error out: directories can't have multiple aliases.
*
* This is needed in the lookup routine of any filesystem that is exportable
* (via knfsd) so that we can build dcache paths to directories effectively.
*
* If a dentry was found and moved, then it is returned. Otherwise NULL
* is returned. This matches the expected return value of ->lookup.
*
* Cluster filesystems may call this function with a negative, hashed dentry.
* In that case, we know that the inode will be a regular file, and also this
* will only occur during atomic_open. So we need to check for the dentry
* being already hashed only in the final case.
*/
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
/* stays NULL unless an existing directory alias is spliced in */
struct dentry *new = NULL;
/* an error-encoded inode is handed straight back as an error dentry */
if (IS_ERR(inode))
return ERR_CAST(inode);
if (inode && S_ISDIR(inode->i_mode)) {
/* directory: look for a pre-existing alias under inode->i_lock */
spin_lock(&inode->i_lock);
/* presumably returns the alias with a reference held (it is dput() below) */
new = __d_find_any_alias(inode);
if (new) {
/* an alias that is not IS_ROOT already has a parent: refuse with -EIO */
if (!IS_ROOT(new)) {
spin_unlock(&inode->i_lock);
dput(new);
/*
 * NOTE(review): unlike the success path below, this return does
 * not iput(inode) - confirm who owns the inode reference on error.
 */
return ERR_PTR(-EIO);
}
/*
 * presumably a loop check: refuse if the alias is an ancestor of
 * the dentry it would replace - TODO confirm d_ancestor() semantics
 */
if (d_ancestor(new, dentry)) {
spin_unlock(&inode->i_lock);
dput(new);
/* NOTE(review): same question about the inode reference as above */
return ERR_PTR(-EIO);
}
/* swap the alias into dentry's place with rename_lock held for write */
write_seqlock(&rename_lock);
__d_materialise_dentry(dentry, new);
write_sequnlock(&rename_lock);
/* new->d_lock is still held here; it is released after the rehash */
__d_drop(new);
_d_rehash(new);
spin_unlock(&new->d_lock);
spin_unlock(&inode->i_lock);
security_d_instantiate(new, inode);
/* the caller's inode reference is dropped; dentry was not instantiated */
iput(inode);
} else {
/* already taking inode->i_lock, so d_add() by hand */
__d_instantiate(dentry, inode);
spin_unlock(&inode->i_lock);
security_d_instantiate(dentry, inode);
d_rehash(dentry);
}
} else {
/* NULL inode or non-directory: attach to the passed-in dentry */
d_instantiate(dentry, inode);
/* the dentry may already be hashed here, so only rehash if it is not */
if (d_unhashed(dentry))
d_rehash(dentry);
}
/* NULL means the passed-in dentry was used; otherwise the spliced alias */
return new;
}
EXPORT_SYMBOL(d_splice_alias);
/**
* d_materialise_unique - introduce an inode into the tree
* @dentry: candidate dentry
@ -2724,7 +2764,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
struct dentry *alias;
/* Does an aliased dentry already exist? */
alias = __d_find_alias(inode, 0);
alias = __d_find_alias(inode);
if (alias) {
actual = alias;
write_seqlock(&rename_lock);

View File

@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
ssize_t ret;
ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE,
ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES,
&sdio->from);
if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {

View File

@ -3455,7 +3455,6 @@ const struct inode_operations ext4_dir_inode_operations = {
.rmdir = ext4_rmdir,
.mknod = ext4_mknod,
.tmpfile = ext4_tmpfile,
.rename = ext4_rename,
.rename2 = ext4_rename2,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,

78
fs/fs_pin.c Normal file
View File

@ -0,0 +1,78 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/fs_pin.h>
#include "internal.h"
#include "mount.h"
/*
 * RCU callback: recover the fs_pin that embeds @head as its 'rcu' member
 * and free it.
 */
static void pin_free_rcu(struct rcu_head *head)
{
	struct fs_pin *pin = container_of(head, struct fs_pin, rcu);

	kfree(pin);
}
static DEFINE_SPINLOCK(pin_lock);
/*
 * Drop one reference on @p.  When the count reaches zero the structure is
 * handed to call_rcu() so that pin_free_rcu() frees it later.
 */
void pin_put(struct fs_pin *p)
{
	if (!atomic_long_dec_and_test(&p->count))
		return;

	call_rcu(&p->rcu, pin_free_rcu);
}
/*
 * Unlink @pin from both hlists it is linked on (via m_list and s_list)
 * while holding pin_lock.
 */
void pin_remove(struct fs_pin *pin)
{
spin_lock(&pin_lock);
hlist_del(&pin->m_list);
hlist_del(&pin->s_list);
spin_unlock(&pin_lock);
}
/*
 * Link @pin onto two lists under pin_lock: the s_pins list of the
 * superblock behind @m (via s_list) and the mnt_pins list of the struct
 * mount that contains @m (via m_list).
 */
void pin_insert(struct fs_pin *pin, struct vfsmount *m)
{
spin_lock(&pin_lock);
hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
spin_unlock(&pin_lock);
}
void mnt_pin_kill(struct mount *m)
{
while (1) {
struct hlist_node *p;
struct fs_pin *pin;
rcu_read_lock();
p = ACCESS_ONCE(m->mnt_pins.first);
if (!p) {
rcu_read_unlock();
break;
}
pin = hlist_entry(p, struct fs_pin, m_list);
if (!atomic_long_inc_not_zero(&pin->count)) {
rcu_read_unlock();
cpu_relax();
continue;
}
rcu_read_unlock();
pin->kill(pin);
}
}
void sb_pin_kill(struct super_block *sb)
{
while (1) {
struct hlist_node *p;
struct fs_pin *pin;
rcu_read_lock();
p = ACCESS_ONCE(sb->s_pins.first);
if (!p) {
rcu_read_unlock();
break;
}
pin = hlist_entry(p, struct fs_pin, s_list);
if (!atomic_long_inc_not_zero(&pin->count)) {
rcu_read_unlock();
cpu_relax();
continue;
}
rcu_read_unlock();
pin->kill(pin);
}
}

View File

@ -845,12 +845,6 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
return err;
}
static int fuse_rename(struct inode *olddir, struct dentry *oldent,
struct inode *newdir, struct dentry *newent)
{
return fuse_rename2(olddir, oldent, newdir, newent, 0);
}
static int fuse_link(struct dentry *entry, struct inode *newdir,
struct dentry *newent)
{
@ -2024,7 +2018,6 @@ static const struct inode_operations fuse_dir_inode_operations = {
.symlink = fuse_symlink,
.unlink = fuse_unlink,
.rmdir = fuse_rmdir,
.rename = fuse_rename,
.rename2 = fuse_rename2,
.link = fuse_link,
.setattr = fuse_setattr,

View File

@ -1303,10 +1303,10 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
unsigned npages;
size_t start;
unsigned n = req->max_pages - req->num_pages;
ssize_t ret = iov_iter_get_pages(ii,
&req->pages[req->num_pages],
n * PAGE_SIZE, &start);
req->max_pages - req->num_pages,
&start);
if (ret < 0)
return ret;

View File

@ -89,6 +89,7 @@ extern int do_mknod(const char *file, int mode, unsigned int major,
extern int link_file(const char *from, const char *to);
extern int hostfs_do_readlink(char *file, char *buf, int size);
extern int rename_file(char *from, char *to);
extern int rename2_file(char *from, char *to, unsigned int flags);
extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
long long *bfree_out, long long *bavail_out,
long long *files_out, long long *ffree_out,

View File

@ -741,21 +741,31 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
return err;
}
static int hostfs_rename(struct inode *from_ino, struct dentry *from,
struct inode *to_ino, struct dentry *to)
static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
char *from_name, *to_name;
char *old_name, *new_name;
int err;
if ((from_name = dentry_name(from)) == NULL)
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
return -EINVAL;
old_name = dentry_name(old_dentry);
if (old_name == NULL)
return -ENOMEM;
if ((to_name = dentry_name(to)) == NULL) {
__putname(from_name);
new_name = dentry_name(new_dentry);
if (new_name == NULL) {
__putname(old_name);
return -ENOMEM;
}
err = rename_file(from_name, to_name);
__putname(from_name);
__putname(to_name);
if (!flags)
err = rename_file(old_name, new_name);
else
err = rename2_file(old_name, new_name, flags);
__putname(old_name);
__putname(new_name);
return err;
}
@ -867,7 +877,7 @@ static const struct inode_operations hostfs_dir_iops = {
.mkdir = hostfs_mkdir,
.rmdir = hostfs_rmdir,
.mknod = hostfs_mknod,
.rename = hostfs_rename,
.rename2 = hostfs_rename2,
.permission = hostfs_permission,
.setattr = hostfs_setattr,
};

View File

@ -14,6 +14,7 @@
#include <sys/time.h>
#include <sys/types.h>
#include <sys/vfs.h>
#include <sys/syscall.h>
#include "hostfs.h"
#include <utime.h>
@ -360,6 +361,33 @@ int rename_file(char *from, char *to)
return 0;
}
/*
 * Rename @from to @to on the host, passing @flags to the renameat2 system
 * call (both paths are resolved relative to AT_FDCWD).
 *
 * Returns 0 on success and a negated errno on failure.  ENOSYS from the
 * host is reported as -EINVAL, and -EINVAL is also returned when no
 * syscall number for renameat2 is known at build time.
 */
int rename2_file(char *from, char *to, unsigned int flags)
{
/* Fallback syscall numbers for when the headers do not define one. */
#ifndef SYS_renameat2
# ifdef __x86_64__
#  define SYS_renameat2 316
# endif
# ifdef __i386__
#  define SYS_renameat2 353
# endif
#endif

#ifdef SYS_renameat2
	if (syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags) >= 0)
		return 0;

	return (errno == ENOSYS) ? -EINVAL : -errno;
#else
	return -EINVAL;
#endif
}
int do_statfs(char *root, long *bsize_out, long long *blocks_out,
long long *bfree_out, long long *bavail_out,
long long *files_out, long long *ffree_out,

View File

@ -131,7 +131,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
/*
* read_write.c
*/
extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
/*
@ -144,3 +143,9 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
* pipe.c
*/
extern const struct file_operations pipefifo_fops;
/*
* fs_pin.c
*/
extern void sb_pin_kill(struct super_block *sb);
extern void mnt_pin_kill(struct mount *m);

View File

@ -55,7 +55,7 @@ struct mount {
int mnt_id; /* mount identifier */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
struct hlist_head mnt_pins;
struct path mnt_ex_mountpoint;
};

View File

@ -1091,10 +1091,10 @@ int follow_down_one(struct path *path)
}
EXPORT_SYMBOL(follow_down_one);
static inline bool managed_dentry_might_block(struct dentry *dentry)
static inline int managed_dentry_rcu(struct dentry *dentry)
{
return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
dentry->d_op->d_manage(dentry, true) < 0);
return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
dentry->d_op->d_manage(dentry, true) : 0;
}
/*
@ -1110,11 +1110,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
* Don't forget we might have a non-mountpoint managed dentry
* that wants to block transit.
*/
if (unlikely(managed_dentry_might_block(path->dentry)))
switch (managed_dentry_rcu(path->dentry)) {
case -ECHILD:
default:
return false;
case -EISDIR:
return true;
case 0:
break;
}
if (!d_mountpoint(path->dentry))
return true;
return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
mounted = __lookup_mnt(path->mnt, path->dentry);
if (!mounted)
@ -1130,7 +1137,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
*/
*inode = path->dentry->d_inode;
}
return read_seqretry(&mount_lock, nd->m_seq);
return read_seqretry(&mount_lock, nd->m_seq) &&
!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
}
static int follow_dotdot_rcu(struct nameidata *nd)
@ -1402,11 +1410,8 @@ static int lookup_fast(struct nameidata *nd,
}
path->mnt = mnt;
path->dentry = dentry;
if (unlikely(!__follow_mount_rcu(nd, path, inode)))
goto unlazy;
if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
goto unlazy;
return 0;
if (likely(__follow_mount_rcu(nd, path, inode)))
return 0;
unlazy:
if (unlazy_walk(nd, dentry))
return -ECHILD;
@ -4019,7 +4024,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* The worst of all namespace operations - renaming directory. "Perverted"
* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
* Problems:
* a) we can get into loop creation. Check is done in is_subdir().
* a) we can get into loop creation.
* b) race potential - two innocent renames can create a loop together.
* That's where 4.4 screws up. Current fix: serialization on
* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
@ -4075,7 +4080,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (error)
return error;
if (!old_dir->i_op->rename)
if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
return -EPERM;
if (flags && !old_dir->i_op->rename2)
@ -4134,10 +4139,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (error)
goto out;
}
if (!flags) {
if (!old_dir->i_op->rename2) {
error = old_dir->i_op->rename(old_dir, old_dentry,
new_dir, new_dentry);
} else {
WARN_ON(old_dir->i_op->rename != NULL);
error = old_dir->i_op->rename2(old_dir, old_dentry,
new_dir, new_dentry, flags);
}

View File

@ -16,7 +16,6 @@
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/idr.h>
#include <linux/acct.h> /* acct_auto_close_mnt */
#include <linux/init.h> /* init_rootfs */
#include <linux/fs_struct.h> /* get_fs_root et.al. */
#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
@ -779,6 +778,20 @@ static void attach_mnt(struct mount *mnt,
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}
/*
 * Hook @mnt into the mount hash and into a child list.
 *
 * Without @shadows, @mnt goes to the head of the hash chain selected by
 * (@parent, @mnt->mnt_mountpoint) and to the tail of @parent's mnt_mounts.
 * With @shadows, @mnt is placed directly behind @shadows in the hash chain
 * and right after it in the mnt_child list.
 */
static void attach_shadowed(struct mount *mnt,
			    struct mount *parent,
			    struct mount *shadows)
{
	if (!shadows) {
		hlist_add_head_rcu(&mnt->mnt_hash,
				   m_hash(&parent->mnt, mnt->mnt_mountpoint));
		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
		return;
	}

	hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
	list_add(&mnt->mnt_child, &shadows->mnt_child);
}
/*
* vfsmount lock must be held for write
*/
@ -797,12 +810,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
list_splice(&head, n->list.prev);
if (shadows)
hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
else
hlist_add_head_rcu(&mnt->mnt_hash,
m_hash(&parent->mnt, mnt->mnt_mountpoint));
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
attach_shadowed(mnt, parent, shadows);
touch_mnt_namespace(n);
}
@ -951,7 +959,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
static void mntput_no_expire(struct mount *mnt)
{
put_again:
rcu_read_lock();
mnt_add_count(mnt, -1);
if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
@ -964,14 +971,6 @@ put_again:
unlock_mount_hash();
return;
}
if (unlikely(mnt->mnt_pinned)) {
mnt_add_count(mnt, mnt->mnt_pinned + 1);
mnt->mnt_pinned = 0;
rcu_read_unlock();
unlock_mount_hash();
acct_auto_close_mnt(&mnt->mnt);
goto put_again;
}
if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
rcu_read_unlock();
unlock_mount_hash();
@ -994,6 +993,8 @@ put_again:
* so mnt_get_writers() below is safe.
*/
WARN_ON(mnt_get_writers(mnt));
if (unlikely(mnt->mnt_pins.first))
mnt_pin_kill(mnt);
fsnotify_vfsmount_delete(&mnt->mnt);
dput(mnt->mnt.mnt_root);
deactivate_super(mnt->mnt.mnt_sb);
@ -1021,25 +1022,15 @@ struct vfsmount *mntget(struct vfsmount *mnt)
}
EXPORT_SYMBOL(mntget);
void mnt_pin(struct vfsmount *mnt)
struct vfsmount *mnt_clone_internal(struct path *path)
{
lock_mount_hash();
real_mount(mnt)->mnt_pinned++;
unlock_mount_hash();
struct mount *p;
p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
if (IS_ERR(p))
return ERR_CAST(p);
p->mnt.mnt_flags |= MNT_INTERNAL;
return &p->mnt;
}
EXPORT_SYMBOL(mnt_pin);
void mnt_unpin(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
lock_mount_hash();
if (mnt->mnt_pinned) {
mnt_add_count(mnt, 1);
mnt->mnt_pinned--;
}
unlock_mount_hash();
}
EXPORT_SYMBOL(mnt_unpin);
static inline void mangle(struct seq_file *m, const char *s)
{
@ -1505,6 +1496,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
continue;
for (s = r; s; s = next_mnt(s, r)) {
struct mount *t = NULL;
if (!(flag & CL_COPY_UNBINDABLE) &&
IS_MNT_UNBINDABLE(s)) {
s = skip_mnt_tree(s);
@ -1526,7 +1518,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
goto out;
lock_mount_hash();
list_add_tail(&q->mnt_list, &res->mnt_list);
attach_mnt(q, parent, p->mnt_mp);
mnt_set_mountpoint(parent, p->mnt_mp, q);
if (!list_empty(&parent->mnt_mounts)) {
t = list_last_entry(&parent->mnt_mounts,
struct mount, mnt_child);
if (t->mnt_mp != p->mnt_mp)
t = NULL;
}
attach_shadowed(q, parent, t);
unlock_mount_hash();
}
}

View File

@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
* if the dentry tree reaches them; however if the dentry already
* exists, we'll pick it up at this point and use it as the root
*/
ret = d_obtain_alias(inode);
ret = d_obtain_root(inode);
if (IS_ERR(ret)) {
dprintk("nfs_get_root: get root dentry failed\n");
goto out;

View File

@ -942,7 +942,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
iput(inode);
}
} else {
dentry = d_obtain_alias(inode);
dentry = d_obtain_root(inode);
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
goto failed_dentry;

View File

@ -22,7 +22,6 @@
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/acct.h>
#include <linux/blkdev.h>
#include <linux/mount.h>
#include <linux/security.h>
@ -702,12 +701,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
return -EACCES;
#endif
if (flags & MS_RDONLY)
acct_auto_close(sb);
shrink_dcache_sb(sb);
remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
if (remount_ro) {
if (sb->s_pins.first) {
up_write(&sb->s_umount);
sb_pin_kill(sb);
down_write(&sb->s_umount);
if (!sb->s_root)
return 0;
if (sb->s_writers.frozen != SB_UNFROZEN)
return -EBUSY;
remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
}
}
shrink_dcache_sb(sb);
/* If we are remounting RDONLY and current sb is read/write,
make sure there are no rw files opened */
if (remount_ro) {

View File

@ -24,14 +24,10 @@ struct super_block;
struct pacct_struct;
struct pid_namespace;
extern int acct_parm[]; /* for sysctl */
extern void acct_auto_close_mnt(struct vfsmount *m);
extern void acct_auto_close(struct super_block *sb);
extern void acct_collect(long exitcode, int group_dead);
extern void acct_process(void);
extern void acct_exit_ns(struct pid_namespace *);
#else
#define acct_auto_close_mnt(x) do { } while (0)
#define acct_auto_close(x) do { } while (0)
#define acct_collect(x,y) do { } while (0)
#define acct_process() do { } while (0)
#define acct_exit_ns(ns) do { } while (0)

View File

@ -249,6 +249,7 @@ extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern struct dentry *d_find_any_alias(struct inode *inode);
extern struct dentry * d_obtain_alias(struct inode *);
extern struct dentry * d_obtain_root(struct inode *);
extern void shrink_dcache_sb(struct super_block *);
extern void shrink_dcache_parent(struct dentry *);
extern void shrink_dcache_for_umount(struct super_block *);

View File

@ -1275,6 +1275,7 @@ struct super_block {
/* AIO completions deferred from interrupt context */
struct workqueue_struct *s_dio_done_wq;
struct hlist_head s_pins;
/*
* Keep the lru lists last in the structure so they always sit on their
@ -2360,6 +2361,7 @@ extern int do_pipe_flags(int *, int);
extern int kernel_read(struct file *, loff_t, char *, unsigned long);
extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t);
extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
extern struct file * open_exec(const char *);
/* fs/dcache.c -- generic fs support functions */

17
include/linux/fs_pin.h Normal file
View File

@ -0,0 +1,17 @@
#include <linux/fs.h>
/*
 * struct fs_pin - embeddable "pin" object.
 *
 * An owner embeds this in its own structure and recovers that structure
 * from the pin with container_of() (kernel/acct.c does this for
 * struct bsd_acct_struct in its kill callback).
 */
struct fs_pin {
/* reference count; kernel/acct.c takes references with atomic_long_inc_not_zero() */
atomic_long_t count;
/*
 * The two list links share storage with the rcu_head.
 * NOTE(review): presumably the rcu_head is only used after the pin has
 * been taken off both lists -- confirm against the fs_pin implementation.
 */
union {
struct {
/* presumably the link on super_block->s_pins -- TODO confirm */
struct hlist_node s_list;
/* presumably the link on the mount's mnt_pins list -- TODO confirm */
struct hlist_node m_list;
};
struct rcu_head rcu;
};
/* owner-supplied callback; it is handed the pin itself */
void (*kill)(struct fs_pin *);
};
/* Drop a reference on the pin (kernel/acct.c pairs it with a successful atomic_long_inc_not_zero()). */
void pin_put(struct fs_pin *);
/* Detach the pin; NOTE(review): presumably unlinks it from the lists above -- confirm. */
void pin_remove(struct fs_pin *);
/* Attach the pin to the given vfsmount; NOTE(review): presumably also to its superblock -- confirm. */
void pin_insert(struct fs_pin *, struct vfsmount *);

View File

@ -69,6 +69,7 @@ struct vfsmount {
};
struct file; /* forward dec */
struct path;
extern int mnt_want_write(struct vfsmount *mnt);
extern int mnt_want_write_file(struct file *file);
@ -77,8 +78,7 @@ extern void mnt_drop_write(struct vfsmount *mnt);
extern void mnt_drop_write_file(struct file *file);
extern void mntput(struct vfsmount *mnt);
extern struct vfsmount *mntget(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);
extern void mnt_unpin(struct vfsmount *mnt);
extern struct vfsmount *mnt_clone_internal(struct path *path);
extern int __mnt_is_readonly(struct vfsmount *mnt);
struct file_system_type;

View File

@ -84,7 +84,7 @@ unsigned long iov_iter_alignment(const struct iov_iter *i);
void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
unsigned long nr_segs, size_t count);
ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
size_t maxsize, size_t *start);
unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);

View File

@ -59,6 +59,7 @@
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
#include <linux/fs_pin.h>
/*
* These constants control the amount of freespace that suspend and
@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
static void do_acct_process(struct bsd_acct_struct *acct,
struct pid_namespace *ns, struct file *);
static void do_acct_process(struct bsd_acct_struct *acct);
/*
* This structure is used so that all the data protected by lock
* can be placed in the same cache line as the lock. This primes
* the cache line to have the data after getting the lock.
*/
struct bsd_acct_struct {
struct fs_pin pin;
struct mutex lock;
int active;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
struct list_head list;
struct work_struct work;
struct completion done;
};
static DEFINE_SPINLOCK(acct_lock);
static LIST_HEAD(acct_list);
/*
* Check the amount of free space and suspend/resume accordingly.
*/
static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
int res;
int act;
u64 resume;
u64 suspend;
spin_lock(&acct_lock);
res = acct->active;
if (!file || time_is_before_jiffies(acct->needcheck))
if (time_is_before_jiffies(acct->needcheck))
goto out;
spin_unlock(&acct_lock);
/* May block */
if (vfs_statfs(&file->f_path, &sbuf))
return res;
suspend = sbuf.f_blocks * SUSPEND;
resume = sbuf.f_blocks * RESUME;
do_div(suspend, 100);
do_div(resume, 100);
if (sbuf.f_bavail <= suspend)
act = -1;
else if (sbuf.f_bavail >= resume)
act = 1;
else
act = 0;
/*
* If some joker switched acct->file under us we'ld better be
* silent and _not_ touch anything.
*/
spin_lock(&acct_lock);
if (file != acct->file) {
if (act)
res = act > 0;
if (vfs_statfs(&acct->file->f_path, &sbuf))
goto out;
}
if (acct->active) {
if (act < 0) {
u64 suspend = sbuf.f_blocks * SUSPEND;
do_div(suspend, 100);
if (sbuf.f_bavail <= suspend) {
acct->active = 0;
pr_info("Process accounting paused\n");
}
} else {
if (act > 0) {
u64 resume = sbuf.f_blocks * RESUME;
do_div(resume, 100);
if (sbuf.f_bavail >= resume) {
acct->active = 1;
pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
res = acct->active;
out:
spin_unlock(&acct_lock);
return acct->active;
}
/*
 * acct_get - look up and lock the accounting state of a pid namespace.
 * @ns: namespace whose ->bacct pointer is sampled
 *
 * Returns NULL when @ns has no accounting structure.  Otherwise returns
 * the structure with one extra reference on its pin.count and with its
 * ->lock mutex held; the caller has to mutex_unlock() and pin_put() when
 * done.  The function retries from the top whenever the structure it
 * found turns out to be on its way out.
 */
static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
{
struct bsd_acct_struct *res;
again:
smp_rmb();
/* the pointer is sampled and the reference taken inside one RCU read section */
rcu_read_lock();
res = ACCESS_ONCE(ns->bacct);
if (!res) {
rcu_read_unlock();
return NULL;
}
/* a count that has already reached zero cannot be resurrected; look again */
if (!atomic_long_inc_not_zero(&res->pin.count)) {
rcu_read_unlock();
cpu_relax();
goto again;
}
rcu_read_unlock();
/* we hold a reference now, so it is safe to sleep on the mutex */
mutex_lock(&res->lock);
/* ->ns is cleared when the structure is killed; drop it and retry the lookup */
if (!res->ns) {
mutex_unlock(&res->lock);
pin_put(&res->pin);
goto again;
}
return res;
}
/*
* Close the old accounting file (if currently open) and then replace
* it with file (if non-NULL).
*
* NOTE: acct_lock MUST be held on entry and exit.
*/
static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
struct pid_namespace *ns)
static void close_work(struct work_struct *work)
{
struct file *old_acct = NULL;
struct pid_namespace *old_ns = NULL;
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
struct file *file = acct->file;
if (file->f_op->flush)
file->f_op->flush(file, NULL);
__fput_sync(file);
complete(&acct->done);
}
if (acct->file) {
old_acct = acct->file;
old_ns = acct->ns;
acct->active = 0;
acct->file = NULL;
static void acct_kill(struct bsd_acct_struct *acct,
struct bsd_acct_struct *new)
{
if (acct) {
struct pid_namespace *ns = acct->ns;
do_acct_process(acct);
INIT_WORK(&acct->work, close_work);
init_completion(&acct->done);
schedule_work(&acct->work);
wait_for_completion(&acct->done);
pin_remove(&acct->pin);
ns->bacct = new;
acct->ns = NULL;
list_del(&acct->list);
atomic_long_dec(&acct->pin.count);
mutex_unlock(&acct->lock);
pin_put(&acct->pin);
}
if (file) {
acct->file = file;
acct->ns = ns;
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
acct->active = 1;
list_add(&acct->list, &acct_list);
}
if (old_acct) {
mnt_unpin(old_acct->f_path.mnt);
spin_unlock(&acct_lock);
do_acct_process(acct, old_ns, old_acct);
filp_close(old_acct, NULL);
spin_lock(&acct_lock);
}
/*
 * acct_pin_kill - fs_pin ->kill callback for process accounting.
 * @pin: the pin embedded in a struct bsd_acct_struct
 *
 * Takes the structure's ->lock and hands it to acct_kill() with no
 * replacement.  If ->ns is already NULL (the structure has been killed
 * elsewhere), the lock is released, pin_put() is called on @pin and
 * acct_kill() is invoked with NULL, which makes it a no-op.
 */
static void acct_pin_kill(struct fs_pin *pin)
{
struct bsd_acct_struct *acct;
acct = container_of(pin, struct bsd_acct_struct, pin);
mutex_lock(&acct->lock);
/* already shut down by somebody else: nothing left to kill */
if (!acct->ns) {
mutex_unlock(&acct->lock);
pin_put(pin);
acct = NULL;
}
/* for a non-NULL acct, acct_kill() is entered with acct->lock held */
acct_kill(acct, NULL);
}
static int acct_on(struct filename *pathname)
{
struct file *file;
struct vfsmount *mnt;
struct pid_namespace *ns;
struct bsd_acct_struct *acct = NULL;
struct vfsmount *mnt, *internal;
struct pid_namespace *ns = task_active_pid_ns(current);
struct bsd_acct_struct *acct, *old;
int err;
acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
if (!acct)
return -ENOMEM;
/* Difference from BSD - they don't do O_APPEND */
file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file))
if (IS_ERR(file)) {
kfree(acct);
return PTR_ERR(file);
}
if (!S_ISREG(file_inode(file)->i_mode)) {
kfree(acct);
filp_close(file, NULL);
return -EACCES;
}
if (!file->f_op->write) {
kfree(acct);
filp_close(file, NULL);
return -EIO;
}
ns = task_active_pid_ns(current);
if (ns->bacct == NULL) {
acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
if (acct == NULL) {
filp_close(file, NULL);
return -ENOMEM;
}
internal = mnt_clone_internal(&file->f_path);
if (IS_ERR(internal)) {
kfree(acct);
filp_close(file, NULL);
return PTR_ERR(internal);
}
spin_lock(&acct_lock);
if (ns->bacct == NULL) {
ns->bacct = acct;
acct = NULL;
err = mnt_want_write(internal);
if (err) {
mntput(internal);
kfree(acct);
filp_close(file, NULL);
return err;
}
mnt = file->f_path.mnt;
mnt_pin(mnt);
acct_file_reopen(ns->bacct, file, ns);
spin_unlock(&acct_lock);
file->f_path.mnt = internal;
mntput(mnt); /* it's pinned, now give up active reference */
kfree(acct);
atomic_long_set(&acct->pin.count, 1);
acct->pin.kill = acct_pin_kill;
acct->file = file;
acct->needcheck = jiffies;
acct->ns = ns;
mutex_init(&acct->lock);
mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
pin_insert(&acct->pin, mnt);
old = acct_get(ns);
if (old)
acct_kill(old, acct);
else
ns->bacct = acct;
mutex_unlock(&acct->lock);
mnt_drop_write(mnt);
mntput(mnt);
return 0;
}
static DEFINE_MUTEX(acct_on_mutex);
/**
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
@ -264,78 +283,20 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
if (IS_ERR(tmp))
return PTR_ERR(tmp);
mutex_lock(&acct_on_mutex);
error = acct_on(tmp);
mutex_unlock(&acct_on_mutex);
putname(tmp);
} else {
struct bsd_acct_struct *acct;
acct = task_active_pid_ns(current)->bacct;
if (acct == NULL)
return 0;
spin_lock(&acct_lock);
acct_file_reopen(acct, NULL, NULL);
spin_unlock(&acct_lock);
acct_kill(acct_get(task_active_pid_ns(current)), NULL);
}
return error;
}
/**
 * acct_auto_close_mnt - turn off accounting that writes through a mount
 * @m: vfsmount being shut down
 *
 * If accounting is turned on for a file whose f_path.mnt is @m,
 * turn that accounting off. Done when @m is about to die.
 */
void acct_auto_close_mnt(struct vfsmount *m)
{
struct bsd_acct_struct *acct;
spin_lock(&acct_lock);
/*
 * acct_file_reopen() must be entered and left with acct_lock held.
 * NOTE(review): it appears to unlink the entry from acct_list and to
 * drop the lock while closing the file, hence the rescan from the head
 * after every hit -- confirm against acct_file_reopen().
 */
restart:
list_for_each_entry(acct, &acct_list, list)
if (acct->file && acct->file->f_path.mnt == m) {
acct_file_reopen(acct, NULL, NULL);
goto restart;
}
spin_unlock(&acct_lock);
}
/**
 * acct_auto_close - turn off a filesystem's accounting if it is on
 * @sb: super block for the filesystem
 *
 * If accounting is turned on for a file whose dentry belongs to @sb
 * (f_path.dentry->d_sb == @sb), turn that accounting off.
 */
void acct_auto_close(struct super_block *sb)
{
struct bsd_acct_struct *acct;
spin_lock(&acct_lock);
/*
 * acct_file_reopen() must be entered and left with acct_lock held.
 * NOTE(review): it appears to unlink the entry from acct_list and to
 * drop the lock while closing the file, hence the rescan from the head
 * after every hit -- confirm against acct_file_reopen().
 */
restart:
list_for_each_entry(acct, &acct_list, list)
if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
acct_file_reopen(acct, NULL, NULL);
goto restart;
}
spin_unlock(&acct_lock);
}
void acct_exit_ns(struct pid_namespace *ns)
{
struct bsd_acct_struct *acct = ns->bacct;
if (acct == NULL)
return;
spin_lock(&acct_lock);
if (acct->file != NULL)
acct_file_reopen(acct, NULL, NULL);
spin_unlock(&acct_lock);
kfree(acct);
acct_kill(acct_get(ns), NULL);
}
/*
@ -450,38 +411,20 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
static void do_acct_process(struct bsd_acct_struct *acct,
struct pid_namespace *ns, struct file *file)
static void fill_ac(acct_t *ac)
{
struct pacct_struct *pacct = &current->signal->pacct;
acct_t ac;
mm_segment_t fs;
unsigned long flim;
u64 elapsed, run_time;
struct tty_struct *tty;
const struct cred *orig_cred;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
/*
* First check to see if there is enough free_space to continue
* the process accounting system.
*/
if (!check_free_space(acct, file))
goto out;
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
memset(&ac, 0, sizeof(acct_t));
memset(ac, 0, sizeof(acct_t));
ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
run_time = ktime_get_ns();
@ -489,9 +432,9 @@ static void do_acct_process(struct bsd_acct_struct *acct,
/* convert nsec -> AHZ */
elapsed = nsec_to_AHZ(run_time);
#if ACCT_VERSION == 3
ac.ac_etime = encode_float(elapsed);
ac->ac_etime = encode_float(elapsed);
#else
ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
(unsigned long) elapsed : (unsigned long) -1l);
#endif
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
@ -499,18 +442,58 @@ static void do_acct_process(struct bsd_acct_struct *acct,
/* new enlarged etime field */
comp2_t etime = encode_comp2_t(elapsed);
ac.ac_etime_hi = etime >> 16;
ac.ac_etime_lo = (u16) etime;
ac->ac_etime_hi = etime >> 16;
ac->ac_etime_lo = (u16) etime;
}
#endif
do_div(elapsed, AHZ);
ac.ac_btime = get_seconds() - elapsed;
ac->ac_btime = get_seconds() - elapsed;
#if ACCT_VERSION==2
ac->ac_ahz = AHZ;
#endif
spin_lock_irq(&current->sighand->siglock);
tty = current->signal->tty; /* Safe as we hold the siglock */
ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
ac->ac_flag = pacct->ac_flag;
ac->ac_mem = encode_comp_t(pacct->ac_mem);
ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
ac->ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
}
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
static void do_acct_process(struct bsd_acct_struct *acct)
{
acct_t ac;
unsigned long flim;
const struct cred *orig_cred;
struct pid_namespace *ns = acct->ns;
struct file *file = acct->file;
/*
* Accounting records are not subject to resource limits.
*/
flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
/*
* First check to see if there is enough free_space to continue
* the process accounting system.
*/
if (!check_free_space(acct))
goto out;
fill_ac(&ac);
/* we really need to bite the bullet and change layout */
ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
#if ACCT_VERSION == 2
ac.ac_ahz = AHZ;
#endif
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
@ -522,45 +505,18 @@ static void do_acct_process(struct bsd_acct_struct *acct,
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
#endif
spin_lock_irq(&current->sighand->siglock);
tty = current->signal->tty; /* Safe as we hold the siglock */
ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
ac.ac_flag = pacct->ac_flag;
ac.ac_mem = encode_comp_t(pacct->ac_mem);
ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
ac.ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
ac.ac_swaps = encode_comp_t(0);
/*
* Get freeze protection. If the fs is frozen, just skip the write
* as we could deadlock the system otherwise.
*/
if (!file_start_write_trylock(file))
goto out;
/*
* Kernel segment override to datasegment and write it
* to the accounting file.
*/
fs = get_fs();
set_fs(KERNEL_DS);
/*
* Accounting records are not subject to resource limits.
*/
flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
file->f_op->write(file, (char *)&ac,
sizeof(acct_t), &file->f_pos);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
file_end_write(file);
if (file_start_write_trylock(file)) {
/* it's been opened O_APPEND, so position is irrelevant */
loff_t pos = 0;
__kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
file_end_write(file);
}
out:
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
revert_creds(orig_cred);
}
@ -609,34 +565,20 @@ void acct_collect(long exitcode, int group_dead)
spin_unlock_irq(&current->sighand->siglock);
}
static void acct_process_in_ns(struct pid_namespace *ns)
static void slow_acct_process(struct pid_namespace *ns)
{
struct file *file = NULL;
struct bsd_acct_struct *acct;
acct = ns->bacct;
/*
* accelerate the common fastpath:
*/
if (!acct || !acct->file)
return;
spin_lock(&acct_lock);
file = acct->file;
if (unlikely(!file)) {
spin_unlock(&acct_lock);
return;
for ( ; ns; ns = ns->parent) {
struct bsd_acct_struct *acct = acct_get(ns);
if (acct) {
do_acct_process(acct);
mutex_unlock(&acct->lock);
pin_put(&acct->pin);
}
}
get_file(file);
spin_unlock(&acct_lock);
do_acct_process(acct, ns, file);
fput(file);
}
/**
* acct_process - now just a wrapper around acct_process_in_ns,
* which in turn is a wrapper around do_acct_process.
* acct_process
*
* handles process accounting for an exiting task
*/
@ -649,6 +591,10 @@ void acct_process(void)
* alive and holds its namespace, which in turn holds
* its parent.
*/
for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
acct_process_in_ns(ns);
for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
if (ns->bacct)
break;
}
if (unlikely(ns))
slow_acct_process(ns);
}

View File

@ -2602,7 +2602,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* that this differs from normal direct-io semantics, which
* will return -EFOO even if some bytes were written.
*/
if (unlikely(status < 0) && !written) {
if (unlikely(status < 0)) {
err = status;
goto out;
}

View File

@ -310,7 +310,7 @@ void iov_iter_init(struct iov_iter *i, int direction,
EXPORT_SYMBOL(iov_iter_init);
static ssize_t get_pages_iovec(struct iov_iter *i,
struct page **pages, size_t maxsize,
struct page **pages, unsigned maxpages,
size_t *start)
{
size_t offset = i->iov_offset;
@ -323,10 +323,10 @@ static ssize_t get_pages_iovec(struct iov_iter *i,
len = iov->iov_len - offset;
if (len > i->count)
len = i->count;
if (len > maxsize)
len = maxsize;
addr = (unsigned long)iov->iov_base + offset;
len += *start = addr & (PAGE_SIZE - 1);
if (len > maxpages * PAGE_SIZE)
len = maxpages * PAGE_SIZE;
addr &= ~(PAGE_SIZE - 1);
n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
@ -588,15 +588,14 @@ static unsigned long alignment_bvec(const struct iov_iter *i)
}
static ssize_t get_pages_bvec(struct iov_iter *i,
struct page **pages, size_t maxsize,
struct page **pages, unsigned maxpages,
size_t *start)
{
const struct bio_vec *bvec = i->bvec;
size_t len = bvec->bv_len - i->iov_offset;
if (len > i->count)
len = i->count;
if (len > maxsize)
len = maxsize;
/* can't be more than PAGE_SIZE */
*start = bvec->bv_offset + i->iov_offset;
get_page(*pages = bvec->bv_page);
@ -712,13 +711,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
EXPORT_SYMBOL(iov_iter_alignment);
ssize_t iov_iter_get_pages(struct iov_iter *i,
struct page **pages, size_t maxsize,
struct page **pages, unsigned maxpages,
size_t *start)
{
if (i->type & ITER_BVEC)
return get_pages_bvec(i, pages, maxsize, start);
return get_pages_bvec(i, pages, maxpages, start);
else
return get_pages_iovec(i, pages, maxsize, start);
return get_pages_iovec(i, pages, maxpages, start);
}
EXPORT_SYMBOL(iov_iter_get_pages);

View File

@ -2323,17 +2323,45 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
return shmem_unlink(dir, dentry);
}
/*
 * Bookkeeping for RENAME_EXCHANGE on tmpfs; shmem_rename2() calls this
 * when that flag is set.  Only link counts and timestamps are touched
 * here.  Always returns 0.
 *
 * When the two entries live in different directories and exactly one of
 * them is a directory, one link moves from the parent that loses the
 * directory to the parent that receives it.
 */
static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
{
	struct inode *old_inode = old_dentry->d_inode;
	struct inode *new_inode = new_dentry->d_inode;
	bool old_is_dir = S_ISDIR(old_inode->i_mode);
	bool new_is_dir = S_ISDIR(new_inode->i_mode);

	if (old_dir != new_dir && old_is_dir != new_is_dir) {
		/* the parent of whichever entry is the directory gives up a link */
		struct inode *loser = old_is_dir ? old_dir : new_dir;
		struct inode *gainer = old_is_dir ? new_dir : old_dir;

		drop_nlink(loser);
		inc_nlink(gainer);
	}

	/* one timestamp, stamped on both parents and both exchanged inodes */
	old_dir->i_ctime = old_dir->i_mtime =
	new_dir->i_ctime = new_dir->i_mtime =
	old_inode->i_ctime =
	new_inode->i_ctime = CURRENT_TIME;

	return 0;
}
/*
* The VFS layer already does all the dentry stuff for rename,
* we just have to decrement the usage count for the target if
* it exists so that the VFS layer correctly frees it when it
* gets overwritten.
*/
static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
{
struct inode *inode = old_dentry->d_inode;
int they_are_dirs = S_ISDIR(inode->i_mode);
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
return -EINVAL;
if (flags & RENAME_EXCHANGE)
return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
@ -3087,7 +3115,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
.mkdir = shmem_mkdir,
.rmdir = shmem_rmdir,
.mknod = shmem_mknod,
.rename = shmem_rename,
.rename2 = shmem_rename2,
.tmpfile = shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR