linux/fs/smb/client/misc.c

1352 lines
35 KiB
C
Raw Normal View History

// SPDX-License-Identifier: LGPL-2.1
/*
*
* Copyright (C) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
*
*/
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/mempool.h>
#include <linux/vmalloc.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
#include "smberr.h"
#include "nterr.h"
#include "cifs_unicode.h"
#include "smb2pdu.h"
#include "cifsfs.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dns_resolve.h"
#include "dfs_cache.h"
cifs: fix use-after-free bug in refresh_cache_worker() The UAF bug occurred because we were putting DFS root sessions in cifs_umount() while DFS cache refresher was being executed. Make DFS root sessions have same lifetime as DFS tcons so we can avoid the use-after-free bug is DFS cache refresher and other places that require IPCs to get new DFS referrals on. Also, get rid of mount group handling in DFS cache as we no longer need it. This fixes below use-after-free bug catched by KASAN [ 379.946955] BUG: KASAN: use-after-free in __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.947642] Read of size 8 at addr ffff888018f57030 by task kworker/u4:3/56 [ 379.948096] [ 379.948208] CPU: 0 PID: 56 Comm: kworker/u4:3 Not tainted 6.2.0-rc7-lku #23 [ 379.948661] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552-rebuilt.opensuse.org 04/01/2014 [ 379.949368] Workqueue: cifs-dfscache refresh_cache_worker [cifs] [ 379.949942] Call Trace: [ 379.950113] <TASK> [ 379.950260] dump_stack_lvl+0x50/0x67 [ 379.950510] print_report+0x16a/0x48e [ 379.950759] ? __virt_addr_valid+0xd8/0x160 [ 379.951040] ? __phys_addr+0x41/0x80 [ 379.951285] kasan_report+0xdb/0x110 [ 379.951533] ? __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.952056] ? __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.952585] __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.953096] ? __pfx___refresh_tcon.isra.0+0x10/0x10 [cifs] [ 379.953637] ? __pfx___mutex_lock+0x10/0x10 [ 379.953915] ? lock_release+0xb6/0x720 [ 379.954167] ? __pfx_lock_acquire+0x10/0x10 [ 379.954443] ? refresh_cache_worker+0x34e/0x6d0 [cifs] [ 379.954960] ? __pfx_wb_workfn+0x10/0x10 [ 379.955239] refresh_cache_worker+0x4ad/0x6d0 [cifs] [ 379.955755] ? __pfx_refresh_cache_worker+0x10/0x10 [cifs] [ 379.956323] ? __pfx_lock_acquired+0x10/0x10 [ 379.956615] ? read_word_at_a_time+0xe/0x20 [ 379.956898] ? lockdep_hardirqs_on_prepare+0x12/0x220 [ 379.957235] process_one_work+0x535/0x990 [ 379.957509] ? __pfx_process_one_work+0x10/0x10 [ 379.957812] ? lock_acquired+0xb7/0x5f0 [ 379.958069] ? __list_add_valid+0x37/0xd0 [ 379.958341] ? __list_add_valid+0x37/0xd0 [ 379.958611] worker_thread+0x8e/0x630 [ 379.958861] ? __pfx_worker_thread+0x10/0x10 [ 379.959148] kthread+0x17d/0x1b0 [ 379.959369] ? __pfx_kthread+0x10/0x10 [ 379.959630] ret_from_fork+0x2c/0x50 [ 379.959879] </TASK> Signed-off-by: Paulo Alcantara (SUSE) <pc@manguebit.com> Cc: stable@vger.kernel.org # 6.2 Signed-off-by: Steve French <stfrench@microsoft.com>
2023-03-15 07:32:54 +08:00
#include "dfs.h"
#endif
#include "fs_context.h"
#include "cached_dir.h"
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
/* The xid serves as a useful identifier for each incoming vfs request,
in a similar way to the mid which is useful to track each sent smb,
and CurrentXid can also provide a running counter (although it
will eventually wrap past zero) of the total vfs operations handled
since the cifs fs was mounted */
unsigned int
_get_xid(void)
{
unsigned int xid;
spin_lock(&GlobalMid_Lock);
GlobalTotalActiveXid++;
/* keep high water mark for number of simultaneous ops in filesystem */
if (GlobalTotalActiveXid > GlobalMaxActiveXid)
GlobalMaxActiveXid = GlobalTotalActiveXid;
if (GlobalTotalActiveXid > 65000)
cifs_dbg(FYI, "warning: more than 65000 requests active\n");
xid = GlobalCurrentXid++;
spin_unlock(&GlobalMid_Lock);
return xid;
}
void
_free_xid(unsigned int xid)
{
spin_lock(&GlobalMid_Lock);
/* if (GlobalTotalActiveXid == 0)
BUG(); */
GlobalTotalActiveXid--;
spin_unlock(&GlobalMid_Lock);
}
struct cifs_ses *
sesInfoAlloc(void)
{
struct cifs_ses *ret_buf;
ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL);
if (ret_buf) {
atomic_inc(&sesInfoAllocCount);
spin_lock_init(&ret_buf->ses_lock);
ret_buf->ses_status = SES_NEW;
++ret_buf->ses_count;
INIT_LIST_HEAD(&ret_buf->smb_ses_list);
INIT_LIST_HEAD(&ret_buf->tcon_list);
mutex_init(&ret_buf->session_mutex);
spin_lock_init(&ret_buf->iface_lock);
INIT_LIST_HEAD(&ret_buf->iface_list);
spin_lock_init(&ret_buf->chan_lock);
}
return ret_buf;
}
void
sesInfoFree(struct cifs_ses *buf_to_free)
{
struct cifs_server_iface *iface = NULL, *niface = NULL;
if (buf_to_free == NULL) {
cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n");
return;
}
atomic_dec(&sesInfoAllocCount);
kfree(buf_to_free->serverOS);
kfree(buf_to_free->serverDomain);
kfree(buf_to_free->serverNOS);
mm, treewide: rename kzfree() to kfree_sensitive() As said by Linus: A symmetric naming is only helpful if it implies symmetries in use. Otherwise it's actively misleading. In "kzalloc()", the z is meaningful and an important part of what the caller wants. In "kzfree()", the z is actively detrimental, because maybe in the future we really _might_ want to use that "memfill(0xdeadbeef)" or something. The "zero" part of the interface isn't even _relevant_. The main reason that kzfree() exists is to clear sensitive information that should not be leaked to other future users of the same memory objects. Rename kzfree() to kfree_sensitive() to follow the example of the recently added kvfree_sensitive() and make the intention of the API more explicit. In addition, memzero_explicit() is used to clear the memory to make sure that it won't get optimized away by the compiler. The renaming is done by using the command sequence: git grep -w --name-only kzfree |\ xargs sed -i 's/kzfree/kfree_sensitive/' followed by some editing of the kfree_sensitive() kerneldoc and adding a kzfree backward compatibility macro in slab.h. [akpm@linux-foundation.org: fs/crypto/inline_crypt.c needs linux/slab.h] [akpm@linux-foundation.org: fix fs/crypto/inline_crypt.c some more] Suggested-by: Joe Perches <joe@perches.com> Signed-off-by: Waiman Long <longman@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Acked-by: David Howells <dhowells@redhat.com> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> Cc: James Morris <jmorris@namei.org> Cc: "Serge E. Hallyn" <serge@hallyn.com> Cc: Joe Perches <joe@perches.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: David Rientjes <rientjes@google.com> Cc: Dan Carpenter <dan.carpenter@oracle.com> Cc: "Jason A . Donenfeld" <Jason@zx2c4.com> Link: http://lkml.kernel.org/r/20200616154311.12314-3-longman@redhat.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 14:18:13 +08:00
kfree_sensitive(buf_to_free->password);
kfree(buf_to_free->user_name);
kfree(buf_to_free->domainName);
mm, treewide: rename kzfree() to kfree_sensitive() As said by Linus: A symmetric naming is only helpful if it implies symmetries in use. Otherwise it's actively misleading. In "kzalloc()", the z is meaningful and an important part of what the caller wants. In "kzfree()", the z is actively detrimental, because maybe in the future we really _might_ want to use that "memfill(0xdeadbeef)" or something. The "zero" part of the interface isn't even _relevant_. The main reason that kzfree() exists is to clear sensitive information that should not be leaked to other future users of the same memory objects. Rename kzfree() to kfree_sensitive() to follow the example of the recently added kvfree_sensitive() and make the intention of the API more explicit. In addition, memzero_explicit() is used to clear the memory to make sure that it won't get optimized away by the compiler. The renaming is done by using the command sequence: git grep -w --name-only kzfree |\ xargs sed -i 's/kzfree/kfree_sensitive/' followed by some editing of the kfree_sensitive() kerneldoc and adding a kzfree backward compatibility macro in slab.h. [akpm@linux-foundation.org: fs/crypto/inline_crypt.c needs linux/slab.h] [akpm@linux-foundation.org: fix fs/crypto/inline_crypt.c some more] Suggested-by: Joe Perches <joe@perches.com> Signed-off-by: Waiman Long <longman@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Acked-by: David Howells <dhowells@redhat.com> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> Cc: James Morris <jmorris@namei.org> Cc: "Serge E. Hallyn" <serge@hallyn.com> Cc: Joe Perches <joe@perches.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: David Rientjes <rientjes@google.com> Cc: Dan Carpenter <dan.carpenter@oracle.com> Cc: "Jason A . Donenfeld" <Jason@zx2c4.com> Link: http://lkml.kernel.org/r/20200616154311.12314-3-longman@redhat.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 14:18:13 +08:00
kfree_sensitive(buf_to_free->auth_key.response);
spin_lock(&buf_to_free->iface_lock);
list_for_each_entry_safe(iface, niface, &buf_to_free->iface_list,
iface_head)
kref_put(&iface->refcount, release_iface);
spin_unlock(&buf_to_free->iface_lock);
mm, treewide: rename kzfree() to kfree_sensitive() As said by Linus: A symmetric naming is only helpful if it implies symmetries in use. Otherwise it's actively misleading. In "kzalloc()", the z is meaningful and an important part of what the caller wants. In "kzfree()", the z is actively detrimental, because maybe in the future we really _might_ want to use that "memfill(0xdeadbeef)" or something. The "zero" part of the interface isn't even _relevant_. The main reason that kzfree() exists is to clear sensitive information that should not be leaked to other future users of the same memory objects. Rename kzfree() to kfree_sensitive() to follow the example of the recently added kvfree_sensitive() and make the intention of the API more explicit. In addition, memzero_explicit() is used to clear the memory to make sure that it won't get optimized away by the compiler. The renaming is done by using the command sequence: git grep -w --name-only kzfree |\ xargs sed -i 's/kzfree/kfree_sensitive/' followed by some editing of the kfree_sensitive() kerneldoc and adding a kzfree backward compatibility macro in slab.h. [akpm@linux-foundation.org: fs/crypto/inline_crypt.c needs linux/slab.h] [akpm@linux-foundation.org: fix fs/crypto/inline_crypt.c some more] Suggested-by: Joe Perches <joe@perches.com> Signed-off-by: Waiman Long <longman@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Acked-by: David Howells <dhowells@redhat.com> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> Cc: James Morris <jmorris@namei.org> Cc: "Serge E. Hallyn" <serge@hallyn.com> Cc: Joe Perches <joe@perches.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: David Rientjes <rientjes@google.com> Cc: Dan Carpenter <dan.carpenter@oracle.com> Cc: "Jason A . Donenfeld" <Jason@zx2c4.com> Link: http://lkml.kernel.org/r/20200616154311.12314-3-longman@redhat.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 14:18:13 +08:00
kfree_sensitive(buf_to_free);
}
struct cifs_tcon *
tconInfoAlloc(void)
{
struct cifs_tcon *ret_buf;
ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL);
if (!ret_buf)
return NULL;
ret_buf->cfids = init_cached_dirs();
if (!ret_buf->cfids) {
kfree(ret_buf);
return NULL;
}
atomic_inc(&tconInfoAllocCount);
ret_buf->status = TID_NEW;
++ret_buf->tc_count;
spin_lock_init(&ret_buf->tc_lock);
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
spin_lock_init(&ret_buf->open_file_lock);
spin_lock_init(&ret_buf->stat_lock);
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
cifs: fix use-after-free bug in refresh_cache_worker() The UAF bug occurred because we were putting DFS root sessions in cifs_umount() while DFS cache refresher was being executed. Make DFS root sessions have same lifetime as DFS tcons so we can avoid the use-after-free bug is DFS cache refresher and other places that require IPCs to get new DFS referrals on. Also, get rid of mount group handling in DFS cache as we no longer need it. This fixes below use-after-free bug catched by KASAN [ 379.946955] BUG: KASAN: use-after-free in __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.947642] Read of size 8 at addr ffff888018f57030 by task kworker/u4:3/56 [ 379.948096] [ 379.948208] CPU: 0 PID: 56 Comm: kworker/u4:3 Not tainted 6.2.0-rc7-lku #23 [ 379.948661] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552-rebuilt.opensuse.org 04/01/2014 [ 379.949368] Workqueue: cifs-dfscache refresh_cache_worker [cifs] [ 379.949942] Call Trace: [ 379.950113] <TASK> [ 379.950260] dump_stack_lvl+0x50/0x67 [ 379.950510] print_report+0x16a/0x48e [ 379.950759] ? __virt_addr_valid+0xd8/0x160 [ 379.951040] ? __phys_addr+0x41/0x80 [ 379.951285] kasan_report+0xdb/0x110 [ 379.951533] ? __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.952056] ? __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.952585] __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.953096] ? __pfx___refresh_tcon.isra.0+0x10/0x10 [cifs] [ 379.953637] ? __pfx___mutex_lock+0x10/0x10 [ 379.953915] ? lock_release+0xb6/0x720 [ 379.954167] ? __pfx_lock_acquire+0x10/0x10 [ 379.954443] ? refresh_cache_worker+0x34e/0x6d0 [cifs] [ 379.954960] ? __pfx_wb_workfn+0x10/0x10 [ 379.955239] refresh_cache_worker+0x4ad/0x6d0 [cifs] [ 379.955755] ? __pfx_refresh_cache_worker+0x10/0x10 [cifs] [ 379.956323] ? __pfx_lock_acquired+0x10/0x10 [ 379.956615] ? read_word_at_a_time+0xe/0x20 [ 379.956898] ? lockdep_hardirqs_on_prepare+0x12/0x220 [ 379.957235] process_one_work+0x535/0x990 [ 379.957509] ? __pfx_process_one_work+0x10/0x10 [ 379.957812] ? lock_acquired+0xb7/0x5f0 [ 379.958069] ? __list_add_valid+0x37/0xd0 [ 379.958341] ? __list_add_valid+0x37/0xd0 [ 379.958611] worker_thread+0x8e/0x630 [ 379.958861] ? __pfx_worker_thread+0x10/0x10 [ 379.959148] kthread+0x17d/0x1b0 [ 379.959369] ? __pfx_kthread+0x10/0x10 [ 379.959630] ret_from_fork+0x2c/0x50 [ 379.959879] </TASK> Signed-off-by: Paulo Alcantara (SUSE) <pc@manguebit.com> Cc: stable@vger.kernel.org # 6.2 Signed-off-by: Steve French <stfrench@microsoft.com>
2023-03-15 07:32:54 +08:00
#ifdef CONFIG_CIFS_DFS_UPCALL
INIT_LIST_HEAD(&ret_buf->dfs_ses_list);
#endif
return ret_buf;
}
void
tconInfoFree(struct cifs_tcon *tcon)
{
if (tcon == NULL) {
cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n");
return;
}
free_cached_dirs(tcon->cfids);
atomic_dec(&tconInfoAllocCount);
kfree(tcon->nativeFileSystem);
kfree_sensitive(tcon->password);
cifs: fix use-after-free bug in refresh_cache_worker() The UAF bug occurred because we were putting DFS root sessions in cifs_umount() while DFS cache refresher was being executed. Make DFS root sessions have same lifetime as DFS tcons so we can avoid the use-after-free bug is DFS cache refresher and other places that require IPCs to get new DFS referrals on. Also, get rid of mount group handling in DFS cache as we no longer need it. This fixes below use-after-free bug catched by KASAN [ 379.946955] BUG: KASAN: use-after-free in __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.947642] Read of size 8 at addr ffff888018f57030 by task kworker/u4:3/56 [ 379.948096] [ 379.948208] CPU: 0 PID: 56 Comm: kworker/u4:3 Not tainted 6.2.0-rc7-lku #23 [ 379.948661] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552-rebuilt.opensuse.org 04/01/2014 [ 379.949368] Workqueue: cifs-dfscache refresh_cache_worker [cifs] [ 379.949942] Call Trace: [ 379.950113] <TASK> [ 379.950260] dump_stack_lvl+0x50/0x67 [ 379.950510] print_report+0x16a/0x48e [ 379.950759] ? __virt_addr_valid+0xd8/0x160 [ 379.951040] ? __phys_addr+0x41/0x80 [ 379.951285] kasan_report+0xdb/0x110 [ 379.951533] ? __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.952056] ? __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.952585] __refresh_tcon.isra.0+0x10b/0xc10 [cifs] [ 379.953096] ? __pfx___refresh_tcon.isra.0+0x10/0x10 [cifs] [ 379.953637] ? __pfx___mutex_lock+0x10/0x10 [ 379.953915] ? lock_release+0xb6/0x720 [ 379.954167] ? __pfx_lock_acquire+0x10/0x10 [ 379.954443] ? refresh_cache_worker+0x34e/0x6d0 [cifs] [ 379.954960] ? __pfx_wb_workfn+0x10/0x10 [ 379.955239] refresh_cache_worker+0x4ad/0x6d0 [cifs] [ 379.955755] ? __pfx_refresh_cache_worker+0x10/0x10 [cifs] [ 379.956323] ? __pfx_lock_acquired+0x10/0x10 [ 379.956615] ? read_word_at_a_time+0xe/0x20 [ 379.956898] ? lockdep_hardirqs_on_prepare+0x12/0x220 [ 379.957235] process_one_work+0x535/0x990 [ 379.957509] ? __pfx_process_one_work+0x10/0x10 [ 379.957812] ? lock_acquired+0xb7/0x5f0 [ 379.958069] ? __list_add_valid+0x37/0xd0 [ 379.958341] ? __list_add_valid+0x37/0xd0 [ 379.958611] worker_thread+0x8e/0x630 [ 379.958861] ? __pfx_worker_thread+0x10/0x10 [ 379.959148] kthread+0x17d/0x1b0 [ 379.959369] ? __pfx_kthread+0x10/0x10 [ 379.959630] ret_from_fork+0x2c/0x50 [ 379.959879] </TASK> Signed-off-by: Paulo Alcantara (SUSE) <pc@manguebit.com> Cc: stable@vger.kernel.org # 6.2 Signed-off-by: Steve French <stfrench@microsoft.com>
2023-03-15 07:32:54 +08:00
#ifdef CONFIG_CIFS_DFS_UPCALL
dfs_put_root_smb_sessions(&tcon->dfs_ses_list);
#endif
kfree(tcon->origin_fullpath);
kfree(tcon);
}
struct smb_hdr *
cifs_buf_get(void)
{
struct smb_hdr *ret_buf = NULL;
/*
* SMB2 header is bigger than CIFS one - no problems to clean some
* more bytes for CIFS.
*/
size_t buf_size = sizeof(struct smb2_hdr);
/*
* We could use negotiated size instead of max_msgsize -
* but it may be more efficient to always alloc same size
* albeit slightly larger than necessary and maxbuffersize
* defaults to this and can not be bigger.
*/
ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS);
/* clear the first few header bytes */
/* for most paths, more is cleared in header_assemble */
memset(ret_buf, 0, buf_size + 3);
atomic_inc(&buf_alloc_count);
#ifdef CONFIG_CIFS_STATS2
atomic_inc(&total_buf_alloc_count);
#endif /* CONFIG_CIFS_STATS2 */
return ret_buf;
}
void
cifs_buf_release(void *buf_to_free)
{
if (buf_to_free == NULL) {
/* cifs_dbg(FYI, "Null buffer passed to cifs_buf_release\n");*/
return;
}
mempool_free(buf_to_free, cifs_req_poolp);
atomic_dec(&buf_alloc_count);
return;
}
struct smb_hdr *
cifs_small_buf_get(void)
{
struct smb_hdr *ret_buf = NULL;
/* We could use negotiated size instead of max_msgsize -
but it may be more efficient to always alloc same size
albeit slightly larger than necessary and maxbuffersize
defaults to this and can not be bigger */
ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
/* No need to clear memory here, cleared in header assemble */
/* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
atomic_inc(&small_buf_alloc_count);
#ifdef CONFIG_CIFS_STATS2
atomic_inc(&total_small_buf_alloc_count);
#endif /* CONFIG_CIFS_STATS2 */
return ret_buf;
}
void
cifs_small_buf_release(void *buf_to_free)
{
if (buf_to_free == NULL) {
cifs_dbg(FYI, "Null buffer passed to cifs_small_buf_release\n");
return;
}
mempool_free(buf_to_free, cifs_sm_req_poolp);
atomic_dec(&small_buf_alloc_count);
return;
}
void
free_rsp_buf(int resp_buftype, void *rsp)
{
if (resp_buftype == CIFS_SMALL_BUFFER)
cifs_small_buf_release(rsp);
else if (resp_buftype == CIFS_LARGE_BUFFER)
cifs_buf_release(rsp);
}
/* NB: MID can not be set if treeCon not passed in, in that
case it is responsbility of caller to set the mid */
void
header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
const struct cifs_tcon *treeCon, int word_count
/* length of fixed section (word count) in two byte units */)
{
char *temp = (char *) buffer;
memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */
buffer->smb_buf_length = cpu_to_be32(
(2 * word_count) + sizeof(struct smb_hdr) -
4 /* RFC 1001 length field does not count */ +
2 /* for bcc field itself */) ;
buffer->Protocol[0] = 0xFF;
buffer->Protocol[1] = 'S';
buffer->Protocol[2] = 'M';
buffer->Protocol[3] = 'B';
buffer->Command = smb_command;
buffer->Flags = 0x00; /* case sensitive */
buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES;
buffer->Pid = cpu_to_le16((__u16)current->tgid);
buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16));
if (treeCon) {
buffer->Tid = treeCon->tid;
if (treeCon->ses) {
if (treeCon->ses->capabilities & CAP_UNICODE)
buffer->Flags2 |= SMBFLG2_UNICODE;
if (treeCon->ses->capabilities & CAP_STATUS32)
buffer->Flags2 |= SMBFLG2_ERR_STATUS;
/* Uid is not converted */
buffer->Uid = treeCon->ses->Suid;
if (treeCon->ses->server)
buffer->Mid = get_next_mid(treeCon->ses->server);
}
if (treeCon->Flags & SMB_SHARE_IS_IN_DFS)
buffer->Flags2 |= SMBFLG2_DFS;
if (treeCon->nocase)
buffer->Flags |= SMBFLG_CASELESS;
if ((treeCon->ses) && (treeCon->ses->server))
if (treeCon->ses->server->sign)
buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
}
/* endian conversion of flags is now done just before sending */
buffer->WordCount = (char) word_count;
return;
}
static int
check_smb_hdr(struct smb_hdr *smb)
{
/* does it have the right SMB "signature" ? */
if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
cifs_dbg(VFS, "Bad protocol string signature header 0x%x\n",
*(unsigned int *)smb->Protocol);
return 1;
}
/* if it's a response then accept */
if (smb->Flags & SMBFLG_RESPONSE)
return 0;
/* only one valid case where server sends us request */
if (smb->Command == SMB_COM_LOCKING_ANDX)
return 0;
cifs_dbg(VFS, "Server sent request, not response. mid=%u\n",
get_mid(smb));
return 1;
}
int
checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
{
struct smb_hdr *smb = (struct smb_hdr *)buf;
__u32 rfclen = be32_to_cpu(smb->smb_buf_length);
__u32 clc_len; /* calculated length */
cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n",
total_read, rfclen);
/* is this frame too small to even get to a BCC? */
if (total_read < 2 + sizeof(struct smb_hdr)) {
if ((total_read >= sizeof(struct smb_hdr) - 1)
&& (smb->Status.CifsError != 0)) {
/* it's an error return */
smb->WordCount = 0;
/* some error cases do not return wct and bcc */
return 0;
} else if ((total_read == sizeof(struct smb_hdr) + 1) &&
(smb->WordCount == 0)) {
char *tmp = (char *)smb;
/* Need to work around a bug in two servers here */
/* First, check if the part of bcc they sent was zero */
if (tmp[sizeof(struct smb_hdr)] == 0) {
/* some servers return only half of bcc
* on simple responses (wct, bcc both zero)
* in particular have seen this on
* ulogoffX and FindClose. This leaves
* one byte of bcc potentially unitialized
*/
/* zero rest of bcc */
tmp[sizeof(struct smb_hdr)+1] = 0;
return 0;
}
cifs_dbg(VFS, "rcvd invalid byte count (bcc)\n");
} else {
cifs_dbg(VFS, "Length less than smb header size\n");
}
return -EIO;
}
/* otherwise, there is enough to get to the BCC */
if (check_smb_hdr(smb))
return -EIO;
clc_len = smbCalcSize(smb);
if (4 + rfclen != total_read) {
cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n",
rfclen);
return -EIO;
}
if (4 + rfclen != clc_len) {
__u16 mid = get_mid(smb);
/* check if bcc wrapped around for large read responses */
if ((rfclen > 64 * 1024) && (rfclen > clc_len)) {
/* check if lengths match mod 64K */
if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF))
return 0; /* bcc wrapped */
}
cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n",
clc_len, 4 + rfclen, mid);
if (4 + rfclen < clc_len) {
cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n",
rfclen, mid);
return -EIO;
} else if (rfclen > clc_len + 512) {
/*
* Some servers (Windows XP in particular) send more
* data than the lengths in the SMB packet would
* indicate on certain calls (byte range locks and
* trans2 find first calls in particular). While the
* client can handle such a frame by ignoring the
* trailing data, we choose limit the amount of extra
* data to 512 bytes.
*/
cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n",
rfclen, mid);
return -EIO;
}
}
return 0;
}
bool
is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
{
struct smb_hdr *buf = (struct smb_hdr *)buffer;
struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifsInodeInfo *pCifsInode;
struct cifsFileInfo *netfile;
cifs_dbg(FYI, "Checking for oplock break or dnotify response\n");
if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
(pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
struct smb_com_transaction_change_notify_rsp *pSMBr =
(struct smb_com_transaction_change_notify_rsp *)buf;
struct file_notify_information *pnotify;
__u32 data_offset = 0;
size_t len = srv->total_read - sizeof(pSMBr->hdr.smb_buf_length);
if (get_bcc(buf) > sizeof(struct file_notify_information)) {
data_offset = le32_to_cpu(pSMBr->DataOffset);
if (data_offset >
len - sizeof(struct file_notify_information)) {
cifs_dbg(FYI, "Invalid data_offset %u\n",
data_offset);
return true;
}
pnotify = (struct file_notify_information *)
((char *)&pSMBr->hdr.Protocol + data_offset);
cifs_dbg(FYI, "dnotify on %s Action: 0x%x\n",
pnotify->FileName, pnotify->Action);
/* cifs_dump_mem("Rcvd notify Data: ",buf,
sizeof(struct smb_hdr)+60); */
return true;
}
if (pSMBr->hdr.Status.CifsError) {
cifs_dbg(FYI, "notify err 0x%x\n",
pSMBr->hdr.Status.CifsError);
return true;
}
return false;
}
if (pSMB->hdr.Command != SMB_COM_LOCKING_ANDX)
return false;
if (pSMB->hdr.Flags & SMBFLG_RESPONSE) {
/* no sense logging error on invalid handle on oplock
break - harmless race between close request and oplock
break response is expected from time to time writing out
large dirty files cached on the client */
if ((NT_STATUS_INVALID_HANDLE) ==
le32_to_cpu(pSMB->hdr.Status.CifsError)) {
cifs_dbg(FYI, "Invalid handle on oplock break\n");
return true;
} else if (ERRbadfid ==
le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
return true;
} else {
return false; /* on valid oplock brk we get "request" */
}
}
if (pSMB->hdr.WordCount != 8)
return false;
cifs_dbg(FYI, "oplock type 0x%x level 0x%x\n",
pSMB->LockType, pSMB->OplockLevel);
if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
return false;
/* If server is a channel, select the primary channel */
pserver = CIFS_SERVER_IS_CHAN(srv) ? srv->primary_server : srv;
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid != buf->Tid)
continue;
cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
spin_lock(&tcon->open_file_lock);
list_for_each_entry(netfile, &tcon->openFileList, tlist) {
if (pSMB->Fid != netfile->fid.netfid)
continue;
cifs_dbg(FYI, "file id match, oplock break\n");
pCifsInode = CIFS_I(d_inode(netfile->dentry));
cifs: Wait for writebacks to complete before attempting write. Problem reported in Red Hat bz 1040329 for strict writes where we cache only when we hold oplock and write direct to the server when we don't. When we receive an oplock break, we first change the oplock value for the inode in cifsInodeInfo->oplock to indicate that we no longer hold the oplock before we enqueue a task to flush changes to the backing device. Once we have completed flushing the changes, we return the oplock to the server. There are 2 ways here where we can have data corruption 1) While we flush changes to the backing device as part of the oplock break, we can have processes write to the file. These writes check for the oplock, find none and attempt to write directly to the server. These direct writes made while we are flushing from cache could be overwritten by data being flushed from the cache causing data corruption. 2) While a thread runs in cifs_strict_writev, the machine could receive and process an oplock break after the thread has checked the oplock and found that it allows us to cache and before we have made changes to the cache. In that case, we end up with a dirty page in cache when we shouldn't have any. This will be flushed later and will overwrite all subsequent writes to the part of the file represented by this page. Before making any writes to the server, we need to confirm that we are not in the process of flushing data to the server and if we are, we should wait until the process is complete before we attempt the write. We should also wait for existing writes to complete before we process an oplock break request which changes oplock values. We add a version specific downgrade_oplock() operation to allow for differences in the oplock values set for the different smb versions. Cc: stable@vger.kernel.org Signed-off-by: Sachin Prabhu <sprabhu@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru> Signed-off-by: Steve French <smfrench@gmail.com>
2014-03-12 00:11:47 +08:00
set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
&pCifsInode->flags);
netfile->oplock_epoch = 0;
netfile->oplock_level = pSMB->OplockLevel;
netfile->oplock_break_cancelled = false;
cifs_queue_oplock_break(netfile);
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
return true;
}
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "No matching file for oplock break\n");
return true;
}
}
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
return true;
}
void
dump_smb(void *buf, int smb_buf_length)
{
if (traceSMB == 0)
return;
print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 8, 2, buf,
smb_buf_length, true);
}
void
cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
{
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
struct cifs_tcon *tcon = NULL;
if (cifs_sb->master_tlink)
tcon = cifs_sb_master_tcon(cifs_sb);
cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
cifs_sb->mnt_cifs_serverino_autodisabled = true;
cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s\n",
tcon ? tcon->tree_name : "new server");
cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS)\n");
cifs_dbg(VFS, "Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n");
}
}
void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
{
oplock &= 0xF;
if (oplock == OPLOCK_EXCLUSIVE) {
cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG;
cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
netfs: Fix gcc-12 warning by embedding vfs inode in netfs_i_context While randstruct was satisfied with using an open-coded "void *" offset cast for the netfs_i_context <-> inode casting, __builtin_object_size() as used by FORTIFY_SOURCE was not as easily fooled. This was causing the following complaint[1] from gcc v12: In file included from include/linux/string.h:253, from include/linux/ceph/ceph_debug.h:7, from fs/ceph/inode.c:2: In function 'fortify_memset_chk', inlined from 'netfs_i_context_init' at include/linux/netfs.h:326:2, inlined from 'ceph_alloc_inode' at fs/ceph/inode.c:463:2: include/linux/fortify-string.h:242:25: warning: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by embedding a struct inode into struct netfs_i_context (which should perhaps be renamed to struct netfs_inode). The struct inode vfs_inode fields are then removed from the 9p, afs, ceph and cifs inode structs and vfs_inode is then simply changed to "netfs.inode" in those filesystems. Further, rename netfs_i_context to netfs_inode, get rid of the netfs_inode() function that converted a netfs_i_context pointer to an inode pointer (that can now be done with &ctx->inode) and rename the netfs_i_context() function to netfs_inode() (which is now a wrapper around container_of()). Most of the changes were done with: perl -p -i -e 's/vfs_inode/netfs.inode/'g \ `git grep -l 'vfs_inode' -- fs/{9p,afs,ceph,cifs}/*.[ch]` Kees suggested doing it with a pair structure[2] and a special declarator to insert that into the network filesystem's inode wrapper[3], but I think it's cleaner to embed it - and then it doesn't matter if struct randomisation reorders things. Dave Chinner suggested using a filesystem-specific VFS_I() function in each filesystem to convert that filesystem's own inode wrapper struct into the VFS inode struct[4]. Version #2: - Fix a couple of missed name changes due to a disabled cifs option. - Rename nfs_i_context to nfs_inode - Use "netfs" instead of "nic" as the member name in per-fs inode wrapper structs. [ This also undoes commit 507160f46c55 ("netfs: gcc-12: temporarily disable '-Wattribute-warning' for now") that is no longer needed ] Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Reported-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> cc: Jonathan Corbet <corbet@lwn.net> cc: Eric Van Hensbergen <ericvh@gmail.com> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: Ilya Dryomov <idryomov@gmail.com> cc: Steve French <smfrench@gmail.com> cc: William Kucharski <william.kucharski@oracle.com> cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> cc: Dave Chinner <david@fromorbit.com> cc: linux-doc@vger.kernel.org cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: samba-technical@lists.samba.org cc: linux-fsdevel@vger.kernel.org cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/r/d2ad3a3d7bdd794c6efb562d2f2b655fb67756b9.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/20220517210230.864239-1-keescook@chromium.org/ [2] Link: https://lore.kernel.org/r/20220518202212.2322058-1-keescook@chromium.org/ [3] Link: https://lore.kernel.org/r/20220524101205.GI2306852@dread.disaster.area/ [4] Link: https://lore.kernel.org/r/165296786831.3591209.12111293034669289733.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165305805651.4094995.7763502506786714216.stgit@warthog.procyon.org.uk # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-06-10 04:46:04 +08:00
&cinode->netfs.inode);
} else if (oplock == OPLOCK_READ) {
cinode->oplock = CIFS_CACHE_READ_FLG;
cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
netfs: Fix gcc-12 warning by embedding vfs inode in netfs_i_context While randstruct was satisfied with using an open-coded "void *" offset cast for the netfs_i_context <-> inode casting, __builtin_object_size() as used by FORTIFY_SOURCE was not as easily fooled. This was causing the following complaint[1] from gcc v12: In file included from include/linux/string.h:253, from include/linux/ceph/ceph_debug.h:7, from fs/ceph/inode.c:2: In function 'fortify_memset_chk', inlined from 'netfs_i_context_init' at include/linux/netfs.h:326:2, inlined from 'ceph_alloc_inode' at fs/ceph/inode.c:463:2: include/linux/fortify-string.h:242:25: warning: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by embedding a struct inode into struct netfs_i_context (which should perhaps be renamed to struct netfs_inode). The struct inode vfs_inode fields are then removed from the 9p, afs, ceph and cifs inode structs and vfs_inode is then simply changed to "netfs.inode" in those filesystems. Further, rename netfs_i_context to netfs_inode, get rid of the netfs_inode() function that converted a netfs_i_context pointer to an inode pointer (that can now be done with &ctx->inode) and rename the netfs_i_context() function to netfs_inode() (which is now a wrapper around container_of()). Most of the changes were done with: perl -p -i -e 's/vfs_inode/netfs.inode/'g \ `git grep -l 'vfs_inode' -- fs/{9p,afs,ceph,cifs}/*.[ch]` Kees suggested doing it with a pair structure[2] and a special declarator to insert that into the network filesystem's inode wrapper[3], but I think it's cleaner to embed it - and then it doesn't matter if struct randomisation reorders things. Dave Chinner suggested using a filesystem-specific VFS_I() function in each filesystem to convert that filesystem's own inode wrapper struct into the VFS inode struct[4]. Version #2: - Fix a couple of missed name changes due to a disabled cifs option. - Rename nfs_i_context to nfs_inode - Use "netfs" instead of "nic" as the member name in per-fs inode wrapper structs. [ This also undoes commit 507160f46c55 ("netfs: gcc-12: temporarily disable '-Wattribute-warning' for now") that is no longer needed ] Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Reported-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> cc: Jonathan Corbet <corbet@lwn.net> cc: Eric Van Hensbergen <ericvh@gmail.com> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: Ilya Dryomov <idryomov@gmail.com> cc: Steve French <smfrench@gmail.com> cc: William Kucharski <william.kucharski@oracle.com> cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> cc: Dave Chinner <david@fromorbit.com> cc: linux-doc@vger.kernel.org cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: samba-technical@lists.samba.org cc: linux-fsdevel@vger.kernel.org cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/r/d2ad3a3d7bdd794c6efb562d2f2b655fb67756b9.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/20220517210230.864239-1-keescook@chromium.org/ [2] Link: https://lore.kernel.org/r/20220518202212.2322058-1-keescook@chromium.org/ [3] Link: https://lore.kernel.org/r/20220524101205.GI2306852@dread.disaster.area/ [4] Link: https://lore.kernel.org/r/165296786831.3591209.12111293034669289733.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165305805651.4094995.7763502506786714216.stgit@warthog.procyon.org.uk # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-06-10 04:46:04 +08:00
&cinode->netfs.inode);
} else
cinode->oplock = 0;
}
cifs: Wait for writebacks to complete before attempting write. Problem reported in Red Hat bz 1040329 for strict writes where we cache only when we hold oplock and write direct to the server when we don't. When we receive an oplock break, we first change the oplock value for the inode in cifsInodeInfo->oplock to indicate that we no longer hold the oplock before we enqueue a task to flush changes to the backing device. Once we have completed flushing the changes, we return the oplock to the server. There are 2 ways here where we can have data corruption 1) While we flush changes to the backing device as part of the oplock break, we can have processes write to the file. These writes check for the oplock, find none and attempt to write directly to the server. These direct writes made while we are flushing from cache could be overwritten by data being flushed from the cache causing data corruption. 2) While a thread runs in cifs_strict_writev, the machine could receive and process an oplock break after the thread has checked the oplock and found that it allows us to cache and before we have made changes to the cache. In that case, we end up with a dirty page in cache when we shouldn't have any. This will be flushed later and will overwrite all subsequent writes to the part of the file represented by this page. Before making any writes to the server, we need to confirm that we are not in the process of flushing data to the server and if we are, we should wait until the process is complete before we attempt the write. We should also wait for existing writes to complete before we process an oplock break request which changes oplock values. We add a version specific downgrade_oplock() operation to allow for differences in the oplock values set for the different smb versions. Cc: stable@vger.kernel.org Signed-off-by: Sachin Prabhu <sprabhu@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru> Signed-off-by: Steve French <smfrench@gmail.com>
2014-03-12 00:11:47 +08:00
/*
* We wait for oplock breaks to be processed before we attempt to perform
* writes.
*/
int cifs_get_writer(struct cifsInodeInfo *cinode)
{
int rc;
start:
rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK,
sched: Remove proliferation of wait_on_bit() action functions The current "wait_on_bit" interface requires an 'action' function to be provided which does the actual waiting. There are over 20 such functions, many of them identical. Most cases can be satisfied by one of just two functions, one which uses io_schedule() and one which just uses schedule(). So: Rename wait_on_bit and wait_on_bit_lock to wait_on_bit_action and wait_on_bit_lock_action to make it explicit that they need an action function. Introduce new wait_on_bit{,_lock} and wait_on_bit{,_lock}_io which are *not* given an action function but implicitly use a standard one. The decision to error-out if a signal is pending is now made based on the 'mode' argument rather than being encoded in the action function. All instances of the old wait_on_bit and wait_on_bit_lock which can use the new version have been changed accordingly and their action functions have been discarded. wait_on_bit{_lock} does not return any specific error code in the event of a signal so the caller must check for non-zero and interpolate their own error code as appropriate. The wait_on_bit() call in __fscache_wait_on_invalidate() was ambiguous as it specified TASK_UNINTERRUPTIBLE but used fscache_wait_bit_interruptible as an action function. David Howells confirms this should be uniformly "uninterruptible" The main remaining user of wait_on_bit{,_lock}_action is NFS which needs to use a freezer-aware schedule() call. A comment in fs/gfs2/glock.c notes that having multiple 'action' functions is useful as they display differently in the 'wchan' field of 'ps'. (and /proc/$PID/wchan). As the new bit_wait{,_io} functions are tagged "__sched", they will not show up at all, but something higher in the stack. So the distinction will still be visible, only with different function names (gds2_glock_wait versus gfs2_glock_dq_wait in the gfs2/glock.c case). Since first version of this patch (against 3.15) two new action functions appeared, on in NFS and one in CIFS. CIFS also now uses an action function that makes the same freezer aware schedule call as NFS. Signed-off-by: NeilBrown <neilb@suse.de> Acked-by: David Howells <dhowells@redhat.com> (fscache, keys) Acked-by: Steven Whitehouse <swhiteho@redhat.com> (gfs2) Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Steve French <sfrench@samba.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/20140707051603.28027.72349.stgit@notabene.brown Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-07-07 13:16:04 +08:00
TASK_KILLABLE);
cifs: Wait for writebacks to complete before attempting write. Problem reported in Red Hat bz 1040329 for strict writes where we cache only when we hold oplock and write direct to the server when we don't. When we receive an oplock break, we first change the oplock value for the inode in cifsInodeInfo->oplock to indicate that we no longer hold the oplock before we enqueue a task to flush changes to the backing device. Once we have completed flushing the changes, we return the oplock to the server. There are 2 ways here where we can have data corruption 1) While we flush changes to the backing device as part of the oplock break, we can have processes write to the file. These writes check for the oplock, find none and attempt to write directly to the server. These direct writes made while we are flushing from cache could be overwritten by data being flushed from the cache causing data corruption. 2) While a thread runs in cifs_strict_writev, the machine could receive and process an oplock break after the thread has checked the oplock and found that it allows us to cache and before we have made changes to the cache. In that case, we end up with a dirty page in cache when we shouldn't have any. This will be flushed later and will overwrite all subsequent writes to the part of the file represented by this page. Before making any writes to the server, we need to confirm that we are not in the process of flushing data to the server and if we are, we should wait until the process is complete before we attempt the write. We should also wait for existing writes to complete before we process an oplock break request which changes oplock values. We add a version specific downgrade_oplock() operation to allow for differences in the oplock values set for the different smb versions. Cc: stable@vger.kernel.org Signed-off-by: Sachin Prabhu <sprabhu@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru> Signed-off-by: Steve French <smfrench@gmail.com>
2014-03-12 00:11:47 +08:00
if (rc)
return rc;
spin_lock(&cinode->writers_lock);
if (!cinode->writers)
set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
cinode->writers++;
/* Check to see if we have started servicing an oplock break */
if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) {
cinode->writers--;
if (cinode->writers == 0) {
clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
}
spin_unlock(&cinode->writers_lock);
goto start;
}
spin_unlock(&cinode->writers_lock);
return 0;
}
void cifs_put_writer(struct cifsInodeInfo *cinode)
{
spin_lock(&cinode->writers_lock);
cinode->writers--;
if (cinode->writers == 0) {
clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
}
spin_unlock(&cinode->writers_lock);
}
/**
* cifs_queue_oplock_break - queue the oplock break handler for cfile
* @cfile: The file to break the oplock on
*
* This function is called from the demultiplex thread when it
* receives an oplock break for @cfile.
*
* Assumes the tcon->open_file_lock is held.
* Assumes cfile->file_info_lock is NOT held.
*/
void cifs_queue_oplock_break(struct cifsFileInfo *cfile)
{
/*
* Bump the handle refcount now while we hold the
* open_file_lock to enforce the validity of it for the oplock
* break handler. The matching put is done at the end of the
* handler.
*/
cifsFileInfo_get(cfile);
queue_work(cifsoplockd_wq, &cfile->oplock_break);
}
cifs: Wait for writebacks to complete before attempting write. Problem reported in Red Hat bz 1040329 for strict writes where we cache only when we hold oplock and write direct to the server when we don't. When we receive an oplock break, we first change the oplock value for the inode in cifsInodeInfo->oplock to indicate that we no longer hold the oplock before we enqueue a task to flush changes to the backing device. Once we have completed flushing the changes, we return the oplock to the server. There are 2 ways here where we can have data corruption 1) While we flush changes to the backing device as part of the oplock break, we can have processes write to the file. These writes check for the oplock, find none and attempt to write directly to the server. These direct writes made while we are flushing from cache could be overwritten by data being flushed from the cache causing data corruption. 2) While a thread runs in cifs_strict_writev, the machine could receive and process an oplock break after the thread has checked the oplock and found that it allows us to cache and before we have made changes to the cache. In that case, we end up with a dirty page in cache when we shouldn't have any. This will be flushed later and will overwrite all subsequent writes to the part of the file represented by this page. Before making any writes to the server, we need to confirm that we are not in the process of flushing data to the server and if we are, we should wait until the process is complete before we attempt the write. We should also wait for existing writes to complete before we process an oplock break request which changes oplock values. We add a version specific downgrade_oplock() operation to allow for differences in the oplock values set for the different smb versions. Cc: stable@vger.kernel.org Signed-off-by: Sachin Prabhu <sprabhu@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru> Signed-off-by: Steve French <smfrench@gmail.com>
2014-03-12 00:11:47 +08:00
void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
{
clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK);
}
bool
backup_cred(struct cifs_sb_info *cifs_sb)
{
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) {
if (uid_eq(cifs_sb->ctx->backupuid, current_fsuid()))
return true;
}
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) {
if (in_group_p(cifs_sb->ctx->backupgid))
return true;
}
return false;
}
void
cifs_del_pending_open(struct cifs_pending_open *open)
{
spin_lock(&tlink_tcon(open->tlink)->open_file_lock);
list_del(&open->olist);
spin_unlock(&tlink_tcon(open->tlink)->open_file_lock);
}
void
cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink,
struct cifs_pending_open *open)
{
memcpy(open->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE);
open->oplock = CIFS_OPLOCK_NO_CHANGE;
open->tlink = tlink;
fid->pending_open = open;
list_add_tail(&open->olist, &tlink_tcon(tlink)->pending_opens);
}
void
cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink,
struct cifs_pending_open *open)
{
spin_lock(&tlink_tcon(tlink)->open_file_lock);
cifs_add_pending_open_locked(fid, tlink, open);
spin_unlock(&tlink_tcon(open->tlink)->open_file_lock);
}
/*
* Critical section which runs after acquiring deferred_lock.
Fix KASAN identified use-after-free issue. [ 612.157429] ================================================================== [ 612.158275] BUG: KASAN: use-after-free in process_one_work+0x90/0x9b0 [ 612.158801] Read of size 8 at addr ffff88810a31ca60 by task kworker/2:9/2382 [ 612.159611] CPU: 2 PID: 2382 Comm: kworker/2:9 Tainted: G OE 5.13.0-rc2+ #98 [ 612.159623] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 [ 612.159640] Workqueue: 0x0 (deferredclose) [ 612.159669] Call Trace: [ 612.159685] dump_stack+0xbb/0x107 [ 612.159711] print_address_description.constprop.0+0x18/0x140 [ 612.159733] ? process_one_work+0x90/0x9b0 [ 612.159743] ? process_one_work+0x90/0x9b0 [ 612.159754] kasan_report.cold+0x7c/0xd8 [ 612.159778] ? lock_is_held_type+0x80/0x130 [ 612.159789] ? process_one_work+0x90/0x9b0 [ 612.159812] kasan_check_range+0x145/0x1a0 [ 612.159834] process_one_work+0x90/0x9b0 [ 612.159877] ? pwq_dec_nr_in_flight+0x110/0x110 [ 612.159914] ? spin_bug+0x90/0x90 [ 612.159967] worker_thread+0x3b6/0x6c0 [ 612.160023] ? process_one_work+0x9b0/0x9b0 [ 612.160038] kthread+0x1dc/0x200 [ 612.160051] ? kthread_create_worker_on_cpu+0xd0/0xd0 [ 612.160092] ret_from_fork+0x1f/0x30 [ 612.160399] Allocated by task 2358: [ 612.160757] kasan_save_stack+0x1b/0x40 [ 612.160768] __kasan_kmalloc+0x9b/0xd0 [ 612.160778] cifs_new_fileinfo+0xb0/0x960 [cifs] [ 612.161170] cifs_open+0xadf/0xf20 [cifs] [ 612.161421] do_dentry_open+0x2aa/0x6b0 [ 612.161432] path_openat+0xbd9/0xfa0 [ 612.161441] do_filp_open+0x11d/0x230 [ 612.161450] do_sys_openat2+0x115/0x240 [ 612.161460] __x64_sys_openat+0xce/0x140 When mod_delayed_work is called to modify the delay of pending work, it might return false and queue a new work when pending work is already scheduled or when try to grab pending work failed. So, Increase the reference count when new work is scheduled to avoid use-after-free. Signed-off-by: Rohith Surabattula <rohiths@microsoft.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2021-05-21 00:45:01 +08:00
* As there is no reference count on cifs_deferred_close, pdclose
* should not be used outside deferred_lock.
*/
bool
cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close **pdclose)
{
struct cifs_deferred_close *dclose;
list_for_each_entry(dclose, &CIFS_I(d_inode(cfile->dentry))->deferred_closes, dlist) {
if ((dclose->netfid == cfile->fid.netfid) &&
(dclose->persistent_fid == cfile->fid.persistent_fid) &&
(dclose->volatile_fid == cfile->fid.volatile_fid)) {
*pdclose = dclose;
return true;
}
}
return false;
}
/*
* Critical section which runs after acquiring deferred_lock.
*/
void
cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close *dclose)
{
bool is_deferred = false;
struct cifs_deferred_close *pdclose;
is_deferred = cifs_is_deferred_close(cfile, &pdclose);
if (is_deferred) {
kfree(dclose);
return;
}
dclose->tlink = cfile->tlink;
dclose->netfid = cfile->fid.netfid;
dclose->persistent_fid = cfile->fid.persistent_fid;
dclose->volatile_fid = cfile->fid.volatile_fid;
list_add_tail(&dclose->dlist, &CIFS_I(d_inode(cfile->dentry))->deferred_closes);
}
/*
* Critical section which runs after acquiring deferred_lock.
*/
void
cifs_del_deferred_close(struct cifsFileInfo *cfile)
{
bool is_deferred = false;
struct cifs_deferred_close *dclose;
is_deferred = cifs_is_deferred_close(cfile, &dclose);
if (!is_deferred)
return;
list_del(&dclose->dlist);
kfree(dclose);
}
void
cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
{
struct cifsFileInfo *cfile = NULL;
struct file_list *tmp_list, *tmp_next_list;
struct list_head file_head;
if (cifs_inode == NULL)
return;
INIT_LIST_HEAD(&file_head);
spin_lock(&cifs_inode->open_file_lock);
list_for_each_entry(cfile, &cifs_inode->openFileList, flist) {
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
spin_lock(&cifs_inode->deferred_lock);
cifs_del_deferred_close(cfile);
spin_unlock(&cifs_inode->deferred_lock);
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
break;
tmp_list->cfile = cfile;
list_add_tail(&tmp_list->list, &file_head);
}
}
}
spin_unlock(&cifs_inode->open_file_lock);
list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) {
_cifsFileInfo_put(tmp_list->cfile, false, false);
list_del(&tmp_list->list);
kfree(tmp_list);
}
}
void
cifs_close_all_deferred_files(struct cifs_tcon *tcon)
{
struct cifsFileInfo *cfile;
struct file_list *tmp_list, *tmp_next_list;
struct list_head file_head;
INIT_LIST_HEAD(&file_head);
spin_lock(&tcon->open_file_lock);
list_for_each_entry(cfile, &tcon->openFileList, tlist) {
Fix KASAN identified use-after-free issue. [ 612.157429] ================================================================== [ 612.158275] BUG: KASAN: use-after-free in process_one_work+0x90/0x9b0 [ 612.158801] Read of size 8 at addr ffff88810a31ca60 by task kworker/2:9/2382 [ 612.159611] CPU: 2 PID: 2382 Comm: kworker/2:9 Tainted: G OE 5.13.0-rc2+ #98 [ 612.159623] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 [ 612.159640] Workqueue: 0x0 (deferredclose) [ 612.159669] Call Trace: [ 612.159685] dump_stack+0xbb/0x107 [ 612.159711] print_address_description.constprop.0+0x18/0x140 [ 612.159733] ? process_one_work+0x90/0x9b0 [ 612.159743] ? process_one_work+0x90/0x9b0 [ 612.159754] kasan_report.cold+0x7c/0xd8 [ 612.159778] ? lock_is_held_type+0x80/0x130 [ 612.159789] ? process_one_work+0x90/0x9b0 [ 612.159812] kasan_check_range+0x145/0x1a0 [ 612.159834] process_one_work+0x90/0x9b0 [ 612.159877] ? pwq_dec_nr_in_flight+0x110/0x110 [ 612.159914] ? spin_bug+0x90/0x90 [ 612.159967] worker_thread+0x3b6/0x6c0 [ 612.160023] ? process_one_work+0x9b0/0x9b0 [ 612.160038] kthread+0x1dc/0x200 [ 612.160051] ? kthread_create_worker_on_cpu+0xd0/0xd0 [ 612.160092] ret_from_fork+0x1f/0x30 [ 612.160399] Allocated by task 2358: [ 612.160757] kasan_save_stack+0x1b/0x40 [ 612.160768] __kasan_kmalloc+0x9b/0xd0 [ 612.160778] cifs_new_fileinfo+0xb0/0x960 [cifs] [ 612.161170] cifs_open+0xadf/0xf20 [cifs] [ 612.161421] do_dentry_open+0x2aa/0x6b0 [ 612.161432] path_openat+0xbd9/0xfa0 [ 612.161441] do_filp_open+0x11d/0x230 [ 612.161450] do_sys_openat2+0x115/0x240 [ 612.161460] __x64_sys_openat+0xce/0x140 When mod_delayed_work is called to modify the delay of pending work, it might return false and queue a new work when pending work is already scheduled or when try to grab pending work failed. So, Increase the reference count when new work is scheduled to avoid use-after-free. Signed-off-by: Rohith Surabattula <rohiths@microsoft.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2021-05-21 00:45:01 +08:00
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
cifs_del_deferred_close(cfile);
spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
break;
tmp_list->cfile = cfile;
list_add_tail(&tmp_list->list, &file_head);
}
Fix KASAN identified use-after-free issue. [ 612.157429] ================================================================== [ 612.158275] BUG: KASAN: use-after-free in process_one_work+0x90/0x9b0 [ 612.158801] Read of size 8 at addr ffff88810a31ca60 by task kworker/2:9/2382 [ 612.159611] CPU: 2 PID: 2382 Comm: kworker/2:9 Tainted: G OE 5.13.0-rc2+ #98 [ 612.159623] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 [ 612.159640] Workqueue: 0x0 (deferredclose) [ 612.159669] Call Trace: [ 612.159685] dump_stack+0xbb/0x107 [ 612.159711] print_address_description.constprop.0+0x18/0x140 [ 612.159733] ? process_one_work+0x90/0x9b0 [ 612.159743] ? process_one_work+0x90/0x9b0 [ 612.159754] kasan_report.cold+0x7c/0xd8 [ 612.159778] ? lock_is_held_type+0x80/0x130 [ 612.159789] ? process_one_work+0x90/0x9b0 [ 612.159812] kasan_check_range+0x145/0x1a0 [ 612.159834] process_one_work+0x90/0x9b0 [ 612.159877] ? pwq_dec_nr_in_flight+0x110/0x110 [ 612.159914] ? spin_bug+0x90/0x90 [ 612.159967] worker_thread+0x3b6/0x6c0 [ 612.160023] ? process_one_work+0x9b0/0x9b0 [ 612.160038] kthread+0x1dc/0x200 [ 612.160051] ? kthread_create_worker_on_cpu+0xd0/0xd0 [ 612.160092] ret_from_fork+0x1f/0x30 [ 612.160399] Allocated by task 2358: [ 612.160757] kasan_save_stack+0x1b/0x40 [ 612.160768] __kasan_kmalloc+0x9b/0xd0 [ 612.160778] cifs_new_fileinfo+0xb0/0x960 [cifs] [ 612.161170] cifs_open+0xadf/0xf20 [cifs] [ 612.161421] do_dentry_open+0x2aa/0x6b0 [ 612.161432] path_openat+0xbd9/0xfa0 [ 612.161441] do_filp_open+0x11d/0x230 [ 612.161450] do_sys_openat2+0x115/0x240 [ 612.161460] __x64_sys_openat+0xce/0x140 When mod_delayed_work is called to modify the delay of pending work, it might return false and queue a new work when pending work is already scheduled or when try to grab pending work failed. So, Increase the reference count when new work is scheduled to avoid use-after-free. Signed-off-by: Rohith Surabattula <rohiths@microsoft.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2021-05-21 00:45:01 +08:00
}
}
spin_unlock(&tcon->open_file_lock);
list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) {
_cifsFileInfo_put(tmp_list->cfile, true, false);
list_del(&tmp_list->list);
kfree(tmp_list);
}
}
void
cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
{
struct cifsFileInfo *cfile;
struct file_list *tmp_list, *tmp_next_list;
struct list_head file_head;
void *page;
const char *full_path;
INIT_LIST_HEAD(&file_head);
page = alloc_dentry_path();
spin_lock(&tcon->open_file_lock);
list_for_each_entry(cfile, &tcon->openFileList, tlist) {
full_path = build_path_from_dentry(cfile->dentry, page);
if (strstr(full_path, path)) {
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
cifs_del_deferred_close(cfile);
spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
break;
tmp_list->cfile = cfile;
list_add_tail(&tmp_list->list, &file_head);
}
}
}
}
spin_unlock(&tcon->open_file_lock);
list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) {
_cifsFileInfo_put(tmp_list->cfile, true, false);
list_del(&tmp_list->list);
kfree(tmp_list);
}
free_dentry_path(page);
}
/* parses DFS referral V3 structure
* caller is responsible for freeing target_nodes
* returns:
* - on success - 0
* - on failure - errno
*/
int
parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size,
unsigned int *num_of_nodes,
struct dfs_info3_param **target_nodes,
const struct nls_table *nls_codepage, int remap,
const char *searchName, bool is_unicode)
{
int i, rc = 0;
char *data_end;
struct dfs_referral_level_3 *ref;
*num_of_nodes = le16_to_cpu(rsp->NumberOfReferrals);
if (*num_of_nodes < 1) {
cifs_dbg(VFS, "num_referrals: must be at least > 0, but we get num_referrals = %d\n",
*num_of_nodes);
rc = -EINVAL;
goto parse_DFS_referrals_exit;
}
ref = (struct dfs_referral_level_3 *) &(rsp->referrals);
if (ref->VersionNumber != cpu_to_le16(3)) {
cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n",
le16_to_cpu(ref->VersionNumber));
rc = -EINVAL;
goto parse_DFS_referrals_exit;
}
/* get the upper boundary of the resp buffer */
data_end = (char *)rsp + rsp_size;
cifs_dbg(FYI, "num_referrals: %d dfs flags: 0x%x ...\n",
*num_of_nodes, le32_to_cpu(rsp->DFSFlags));
*target_nodes = kcalloc(*num_of_nodes, sizeof(struct dfs_info3_param),
GFP_KERNEL);
if (*target_nodes == NULL) {
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
}
/* collect necessary data from referrals */
for (i = 0; i < *num_of_nodes; i++) {
char *temp;
int max_len;
struct dfs_info3_param *node = (*target_nodes)+i;
node->flags = le32_to_cpu(rsp->DFSFlags);
if (is_unicode) {
__le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
GFP_KERNEL);
if (tmp == NULL) {
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
}
cifsConvertToUTF16((__le16 *) tmp, searchName,
PATH_MAX, nls_codepage, remap);
node->path_consumed = cifs_utf16_bytes(tmp,
le16_to_cpu(rsp->PathConsumed),
nls_codepage);
kfree(tmp);
} else
node->path_consumed = le16_to_cpu(rsp->PathConsumed);
node->server_type = le16_to_cpu(ref->ServerType);
node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags);
/* copy DfsPath */
temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
max_len = data_end - temp;
node->path_name = cifs_strndup_from_utf16(temp, max_len,
is_unicode, nls_codepage);
if (!node->path_name) {
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
}
/* copy link target UNC */
temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
max_len = data_end - temp;
node->node_name = cifs_strndup_from_utf16(temp, max_len,
is_unicode, nls_codepage);
if (!node->node_name) {
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
}
node->ttl = le32_to_cpu(ref->TimeToLive);
ref++;
}
parse_DFS_referrals_exit:
if (rc) {
free_dfs_info_array(*target_nodes, *num_of_nodes);
*target_nodes = NULL;
*num_of_nodes = 0;
}
return rc;
}
struct cifs_aio_ctx *
cifs_aio_ctx_alloc(void)
{
struct cifs_aio_ctx *ctx;
/*
* Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io
* to false so that we know when we have to unreference pages within
* cifs_aio_ctx_release()
*/
ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL);
if (!ctx)
return NULL;
INIT_LIST_HEAD(&ctx->list);
mutex_init(&ctx->aio_mutex);
init_completion(&ctx->done);
kref_init(&ctx->refcount);
return ctx;
}
void
cifs_aio_ctx_release(struct kref *refcount)
{
struct cifs_aio_ctx *ctx = container_of(refcount,
struct cifs_aio_ctx, refcount);
cifsFileInfo_put(ctx->cfile);
/*
* ctx->bv is only set if setup_aio_ctx_iter() was call successfuly
cifs: Change the I/O paths to use an iterator rather than a page list Currently, the cifs I/O paths hand lists of pages from the VM interface routines at the top all the way through the intervening layers to the socket interface at the bottom. This is a problem, however, for interfacing with netfslib which passes an iterator through to the ->issue_read() method (and will pass an iterator through to the ->issue_write() method in future). Netfslib takes over bounce buffering for direct I/O, async I/O and encrypted content, so cifs doesn't need to do that. Netfslib also converts IOVEC-type iterators into BVEC-type iterators if necessary. Further, cifs needs foliating - and folios may come in a variety of sizes, so a page list pointing to an array of heterogeneous pages may cause problems in places such as where crypto is done. Change the cifs I/O paths to hand iov_iter iterators all the way through instead. Notes: (1) Some old routines are #if'd out to be removed in a follow up patch so as to avoid confusing diff, thereby making the diff output easier to follow. I've removed functions that don't overlap with anything added. (2) struct smb_rqst loses rq_pages, rq_offset, rq_npages, rq_pagesz and rq_tailsz which describe the pages forming the buffer; instead there's an rq_iter describing the source buffer and an rq_buffer which is used to hold the buffer for encryption. (3) struct cifs_readdata and cifs_writedata are similarly modified to smb_rqst. The ->read_into_pages() and ->copy_into_pages() are then replaced with passing the iterator directly to the socket. The iterators are stored in these structs so that they are persistent and don't get deallocated when the function returns (unlike if they were stack variables). (4) Buffered writeback is overhauled, borrowing the code from the afs filesystem to gather up contiguous runs of folios. The XARRAY-type iterator is then used to refer directly to the pagecache and can be passed to the socket to transmit data directly from there. This includes: cifs_extend_writeback() cifs_write_back_from_locked_folio() cifs_writepages_region() cifs_writepages() (5) Pages are converted to folios. (6) Direct I/O uses netfs_extract_user_iter() to create a BVEC-type iterator from an IOBUF/UBUF-type source iterator. (7) smb2_get_aead_req() uses netfs_extract_iter_to_sg() to extract page fragments from the iterator into the scatterlists that the crypto layer prefers. (8) smb2_init_transform_rq() attached pages to smb_rqst::rq_buffer, an xarray, to use as a bounce buffer for encryption. An XARRAY-type iterator can then be used to pass the bounce buffer to lower layers. Signed-off-by: David Howells <dhowells@redhat.com> cc: Steve French <sfrench@samba.org> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Paulo Alcantara <pc@cjr.nz> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org Link: https://lore.kernel.org/r/164311907995.2806745.400147335497304099.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/164928620163.457102.11602306234438271112.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165211420279.3154751.15923591172438186144.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165348880385.2106726.3220789453472800240.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165364827111.3334034.934805882842932881.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/166126396180.708021.271013668175370826.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/166697259595.61150.5982032408321852414.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/166732031756.3186319.12528413619888902872.stgit@warthog.procyon.org.uk/ # rfc Signed-off-by: Steve French <stfrench@microsoft.com>
2022-01-25 05:13:24 +08:00
* which means that iov_iter_extract_pages() was a success and thus
* that we may have references or pins on pages that we need to
* release.
*/
if (ctx->bv) {
cifs: Change the I/O paths to use an iterator rather than a page list Currently, the cifs I/O paths hand lists of pages from the VM interface routines at the top all the way through the intervening layers to the socket interface at the bottom. This is a problem, however, for interfacing with netfslib which passes an iterator through to the ->issue_read() method (and will pass an iterator through to the ->issue_write() method in future). Netfslib takes over bounce buffering for direct I/O, async I/O and encrypted content, so cifs doesn't need to do that. Netfslib also converts IOVEC-type iterators into BVEC-type iterators if necessary. Further, cifs needs foliating - and folios may come in a variety of sizes, so a page list pointing to an array of heterogeneous pages may cause problems in places such as where crypto is done. Change the cifs I/O paths to hand iov_iter iterators all the way through instead. Notes: (1) Some old routines are #if'd out to be removed in a follow up patch so as to avoid confusing diff, thereby making the diff output easier to follow. I've removed functions that don't overlap with anything added. (2) struct smb_rqst loses rq_pages, rq_offset, rq_npages, rq_pagesz and rq_tailsz which describe the pages forming the buffer; instead there's an rq_iter describing the source buffer and an rq_buffer which is used to hold the buffer for encryption. (3) struct cifs_readdata and cifs_writedata are similarly modified to smb_rqst. The ->read_into_pages() and ->copy_into_pages() are then replaced with passing the iterator directly to the socket. The iterators are stored in these structs so that they are persistent and don't get deallocated when the function returns (unlike if they were stack variables). (4) Buffered writeback is overhauled, borrowing the code from the afs filesystem to gather up contiguous runs of folios. The XARRAY-type iterator is then used to refer directly to the pagecache and can be passed to the socket to transmit data directly from there. This includes: cifs_extend_writeback() cifs_write_back_from_locked_folio() cifs_writepages_region() cifs_writepages() (5) Pages are converted to folios. (6) Direct I/O uses netfs_extract_user_iter() to create a BVEC-type iterator from an IOBUF/UBUF-type source iterator. (7) smb2_get_aead_req() uses netfs_extract_iter_to_sg() to extract page fragments from the iterator into the scatterlists that the crypto layer prefers. (8) smb2_init_transform_rq() attached pages to smb_rqst::rq_buffer, an xarray, to use as a bounce buffer for encryption. An XARRAY-type iterator can then be used to pass the bounce buffer to lower layers. Signed-off-by: David Howells <dhowells@redhat.com> cc: Steve French <sfrench@samba.org> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Paulo Alcantara <pc@cjr.nz> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org Link: https://lore.kernel.org/r/164311907995.2806745.400147335497304099.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/164928620163.457102.11602306234438271112.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165211420279.3154751.15923591172438186144.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165348880385.2106726.3220789453472800240.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165364827111.3334034.934805882842932881.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/166126396180.708021.271013668175370826.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/166697259595.61150.5982032408321852414.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/166732031756.3186319.12528413619888902872.stgit@warthog.procyon.org.uk/ # rfc Signed-off-by: Steve French <stfrench@microsoft.com>
2022-01-25 05:13:24 +08:00
if (ctx->should_dirty || ctx->bv_need_unpin) {
unsigned int i;
cifs: Change the I/O paths to use an iterator rather than a page list Currently, the cifs I/O paths hand lists of pages from the VM interface routines at the top all the way through the intervening layers to the socket interface at the bottom. This is a problem, however, for interfacing with netfslib which passes an iterator through to the ->issue_read() method (and will pass an iterator through to the ->issue_write() method in future). Netfslib takes over bounce buffering for direct I/O, async I/O and encrypted content, so cifs doesn't need to do that. Netfslib also converts IOVEC-type iterators into BVEC-type iterators if necessary. Further, cifs needs foliating - and folios may come in a variety of sizes, so a page list pointing to an array of heterogeneous pages may cause problems in places such as where crypto is done. Change the cifs I/O paths to hand iov_iter iterators all the way through instead. Notes: (1) Some old routines are #if'd out to be removed in a follow up patch so as to avoid confusing diff, thereby making the diff output easier to follow. I've removed functions that don't overlap with anything added. (2) struct smb_rqst loses rq_pages, rq_offset, rq_npages, rq_pagesz and rq_tailsz which describe the pages forming the buffer; instead there's an rq_iter describing the source buffer and an rq_buffer which is used to hold the buffer for encryption. (3) struct cifs_readdata and cifs_writedata are similarly modified to smb_rqst. The ->read_into_pages() and ->copy_into_pages() are then replaced with passing the iterator directly to the socket. The iterators are stored in these structs so that they are persistent and don't get deallocated when the function returns (unlike if they were stack variables). (4) Buffered writeback is overhauled, borrowing the code from the afs filesystem to gather up contiguous runs of folios. The XARRAY-type iterator is then used to refer directly to the pagecache and can be passed to the socket to transmit data directly from there. This includes: cifs_extend_writeback() cifs_write_back_from_locked_folio() cifs_writepages_region() cifs_writepages() (5) Pages are converted to folios. (6) Direct I/O uses netfs_extract_user_iter() to create a BVEC-type iterator from an IOBUF/UBUF-type source iterator. (7) smb2_get_aead_req() uses netfs_extract_iter_to_sg() to extract page fragments from the iterator into the scatterlists that the crypto layer prefers. (8) smb2_init_transform_rq() attached pages to smb_rqst::rq_buffer, an xarray, to use as a bounce buffer for encryption. An XARRAY-type iterator can then be used to pass the bounce buffer to lower layers. Signed-off-by: David Howells <dhowells@redhat.com> cc: Steve French <sfrench@samba.org> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Paulo Alcantara <pc@cjr.nz> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org Link: https://lore.kernel.org/r/164311907995.2806745.400147335497304099.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/164928620163.457102.11602306234438271112.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165211420279.3154751.15923591172438186144.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165348880385.2106726.3220789453472800240.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165364827111.3334034.934805882842932881.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/166126396180.708021.271013668175370826.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/166697259595.61150.5982032408321852414.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/166732031756.3186319.12528413619888902872.stgit@warthog.procyon.org.uk/ # rfc Signed-off-by: Steve French <stfrench@microsoft.com>
2022-01-25 05:13:24 +08:00
for (i = 0; i < ctx->nr_pinned_pages; i++) {
struct page *page = ctx->bv[i].bv_page;
if (ctx->should_dirty)
set_page_dirty(page);
if (ctx->bv_need_unpin)
unpin_user_page(page);
}
}
kvfree(ctx->bv);
}
kfree(ctx);
}
/**
* cifs_alloc_hash - allocate hash and hash context together
* @name: The name of the crypto hash algo
* @sdesc: SHASH descriptor where to put the pointer to the hash TFM
*
* The caller has to make sure @sdesc is initialized to either NULL or
* a valid context. It can be freed via cifs_free_hash().
*/
int
cifs_alloc_hash(const char *name, struct shash_desc **sdesc)
{
int rc = 0;
struct crypto_shash *alg = NULL;
if (*sdesc)
return 0;
alg = crypto_alloc_shash(name, 0, 0);
if (IS_ERR(alg)) {
cifs_dbg(VFS, "Could not allocate shash TFM '%s'\n", name);
rc = PTR_ERR(alg);
*sdesc = NULL;
return rc;
}
*sdesc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(alg), GFP_KERNEL);
if (*sdesc == NULL) {
cifs_dbg(VFS, "no memory left to allocate shash TFM '%s'\n", name);
crypto_free_shash(alg);
return -ENOMEM;
}
(*sdesc)->tfm = alg;
return 0;
}
/**
* cifs_free_hash - free hash and hash context together
* @sdesc: Where to find the pointer to the hash TFM
*
* Freeing a NULL descriptor is safe.
*/
void
cifs_free_hash(struct shash_desc **sdesc)
{
if (unlikely(!sdesc) || !*sdesc)
return;
if ((*sdesc)->tfm) {
crypto_free_shash((*sdesc)->tfm);
(*sdesc)->tfm = NULL;
}
kfree_sensitive(*sdesc);
*sdesc = NULL;
}
void extract_unc_hostname(const char *unc, const char **h, size_t *len)
{
const char *end;
/* skip initial slashes */
while (*unc && (*unc == '\\' || *unc == '/'))
unc++;
end = unc;
while (*end && !(*end == '\\' || *end == '/'))
end++;
*h = unc;
*len = end - unc;
}
/**
* copy_path_name - copy src path to dst, possibly truncating
* @dst: The destination buffer
* @src: The source name
*
* returns number of bytes written (including trailing nul)
*/
int copy_path_name(char *dst, const char *src)
{
int name_len;
/*
* PATH_MAX includes nul, so if strlen(src) >= PATH_MAX it
* will truncate and strlen(dst) will be PATH_MAX-1
*/
name_len = strscpy(dst, src, PATH_MAX);
if (WARN_ON_ONCE(name_len < 0))
name_len = PATH_MAX-1;
/* we count the trailing nul */
name_len++;
return name_len;
}
struct super_cb_data {
void *data;
struct super_block *sb;
};
static void tcon_super_cb(struct super_block *sb, void *arg)
{
struct super_cb_data *sd = arg;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *t1 = sd->data, *t2;
if (sd->sb)
return;
cifs_sb = CIFS_SB(sb);
t2 = cifs_sb_master_tcon(cifs_sb);
spin_lock(&t2->tc_lock);
if (t1->ses == t2->ses &&
t1->ses->server == t2->ses->server &&
t2->origin_fullpath &&
dfs_src_pathname_equal(t2->origin_fullpath, t1->origin_fullpath))
sd->sb = sb;
spin_unlock(&t2->tc_lock);
}
static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void *),
void *data)
{
struct super_cb_data sd = {
.data = data,
.sb = NULL,
};
struct file_system_type **fs_type = (struct file_system_type *[]) {
&cifs_fs_type, &smb3_fs_type, NULL,
};
for (; *fs_type; fs_type++) {
iterate_supers_type(*fs_type, f, &sd);
if (sd.sb) {
/*
* Grab an active reference in order to prevent automounts (DFS links)
* of expiring and then freeing up our cifs superblock pointer while
* we're doing failover.
*/
cifs_sb_active(sd.sb);
return sd.sb;
}
}
pr_warn_once("%s: could not find dfs superblock\n", __func__);
return ERR_PTR(-EINVAL);
}
static void __cifs_put_super(struct super_block *sb)
{
if (!IS_ERR_OR_NULL(sb))
cifs_sb_deactive(sb);
}
struct super_block *cifs_get_dfs_tcon_super(struct cifs_tcon *tcon)
{
spin_lock(&tcon->tc_lock);
if (!tcon->origin_fullpath) {
spin_unlock(&tcon->tc_lock);
return ERR_PTR(-ENOENT);
}
spin_unlock(&tcon->tc_lock);
return __cifs_get_super(tcon_super_cb, tcon);
}
void cifs_put_tcp_super(struct super_block *sb)
{
__cifs_put_super(sb);
}
#ifdef CONFIG_CIFS_DFS_UPCALL
int match_target_ip(struct TCP_Server_Info *server,
const char *share, size_t share_len,
bool *result)
{
int rc;
char *target;
struct sockaddr_storage ss;
*result = false;
target = kzalloc(share_len + 3, GFP_KERNEL);
if (!target)
return -ENOMEM;
scnprintf(target, share_len + 3, "\\\\%.*s", (int)share_len, share);
cifs_dbg(FYI, "%s: target name: %s\n", __func__, target + 2);
rc = dns_resolve_server_name_to_ip(target, (struct sockaddr *)&ss, NULL);
kfree(target);
if (rc < 0)
return rc;
spin_lock(&server->srv_lock);
*result = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, (struct sockaddr *)&ss);
spin_unlock(&server->srv_lock);
cifs_dbg(FYI, "%s: ip addresses match: %u\n", __func__, *result);
return 0;
}
int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix)
{
int rc;
kfree(cifs_sb->prepath);
cifs_sb->prepath = NULL;
if (prefix && *prefix) {
cifs_sb->prepath = cifs_sanitize_prepath(prefix, GFP_ATOMIC);
if (IS_ERR(cifs_sb->prepath)) {
rc = PTR_ERR(cifs_sb->prepath);
cifs_sb->prepath = NULL;
return rc;
}
if (cifs_sb->prepath)
convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb));
}
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
return 0;
}
/*
* Handle weird Windows SMB server behaviour. It responds with
* STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request for
* "\<server>\<dfsname>\<linkpath>" DFS reference, where <dfsname> contains
* non-ASCII unicode symbols.
*/
int cifs_inval_name_dfs_link_error(const unsigned int xid,
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const char *full_path,
bool *islink)
{
struct cifs_ses *ses = tcon->ses;
size_t len;
char *path;
char *ref_path;
*islink = false;
/*
* Fast path - skip check when @full_path doesn't have a prefix path to
* look up or tcon is not DFS.
*/
if (strlen(full_path) < 2 || !cifs_sb ||
(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) ||
!is_tcon_dfs(tcon))
return 0;
spin_lock(&tcon->tc_lock);
if (!tcon->origin_fullpath) {
spin_unlock(&tcon->tc_lock);
return 0;
}
spin_unlock(&tcon->tc_lock);
/*
* Slow path - tcon is DFS and @full_path has prefix path, so attempt
* to get a referral to figure out whether it is an DFS link.
*/
len = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1) + strlen(full_path) + 1;
path = kmalloc(len, GFP_KERNEL);
if (!path)
return -ENOMEM;
scnprintf(path, len, "%s%s", tcon->tree_name, full_path);
ref_path = dfs_cache_canonical_path(path + 1, cifs_sb->local_nls,
cifs_remap(cifs_sb));
kfree(path);
if (IS_ERR(ref_path)) {
if (PTR_ERR(ref_path) != -EINVAL)
return PTR_ERR(ref_path);
} else {
struct dfs_info3_param *refs = NULL;
int num_refs = 0;
/*
* XXX: we are not using dfs_cache_find() here because we might
* end up filling all the DFS cache and thus potentially
* removing cached DFS targets that the client would eventually
* need during failover.
*/
ses = CIFS_DFS_ROOT_SES(ses);
if (ses->server->ops->get_dfs_refer &&
!ses->server->ops->get_dfs_refer(xid, ses, ref_path, &refs,
&num_refs, cifs_sb->local_nls,
cifs_remap(cifs_sb)))
*islink = refs[0].server_type == DFS_TYPE_LINK;
free_dfs_info_array(refs, num_refs);
kfree(ref_path);
}
return 0;
}
#endif
int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry)
{
int timeout = 10;
int rc;
spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsNeedReconnect) {
spin_unlock(&server->srv_lock);
return 0;
}
timeout *= server->nr_targets;
spin_unlock(&server->srv_lock);
/*
* Give demultiplex thread up to 10 seconds to each target available for
* reconnect -- should be greater than cifs socket timeout which is 7
* seconds.
*
* On "soft" mounts we wait once. Hard mounts keep retrying until
* process is killed or server comes back on-line.
*/
do {
rc = wait_event_interruptible_timeout(server->response_q,
(server->tcpStatus != CifsNeedReconnect),
timeout * HZ);
if (rc < 0) {
cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n",
__func__);
return -ERESTARTSYS;
}
/* are we still trying to reconnect? */
spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsNeedReconnect) {
spin_unlock(&server->srv_lock);
return 0;
}
spin_unlock(&server->srv_lock);
} while (retry);
cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__);
return -EHOSTDOWN;
}