2020-11-13 06:01:16 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/* Copyright(c) 2016-20 Intel Corporation. */
|
|
|
|
|
2021-03-19 15:23:09 +08:00
|
|
|
#include <linux/file.h>
|
2020-11-13 06:01:16 +08:00
|
|
|
#include <linux/freezer.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/kthread.h>
|
2021-03-19 15:23:09 +08:00
|
|
|
#include <linux/miscdevice.h>
|
2022-01-05 01:15:27 +08:00
|
|
|
#include <linux/node.h>
|
2020-11-13 06:01:16 +08:00
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/ratelimit.h>
|
|
|
|
#include <linux/sched/mm.h>
|
|
|
|
#include <linux/sched/signal.h>
|
|
|
|
#include <linux/slab.h>
|
2022-01-05 01:15:27 +08:00
|
|
|
#include <linux/sysfs.h>
|
2021-03-19 15:23:09 +08:00
|
|
|
#include <asm/sgx.h>
|
2020-11-13 06:01:22 +08:00
|
|
|
#include "driver.h"
|
|
|
|
#include "encl.h"
|
2020-11-13 06:01:16 +08:00
|
|
|
#include "encls.h"
|
|
|
|
|
|
|
|
struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
|
|
|
|
static int sgx_nr_epc_sections;
|
|
|
|
static struct task_struct *ksgxd_tsk;
|
x86/sgx: Add a page reclaimer
Just like normal RAM, there is a limited amount of enclave memory available
and overcommitting it is a very valuable tool to reduce resource use.
Introduce a simple reclaim mechanism for enclave pages.
In contrast to normal page reclaim, the kernel cannot directly access
enclave memory. To get around this, the SGX architecture provides a set of
functions to help. Among other things, these functions copy enclave memory
to and from normal memory, encrypting it and protecting its integrity in
the process.
Implement a page reclaimer by using these functions. Picks victim pages in
LRU fashion from all the enclaves running in the system. A new kernel
thread (ksgxswapd) reclaims pages in the background based on watermarks,
similar to normal kswapd.
All enclave pages can be reclaimed, architecturally. But, there are some
limits to this, such as the special SECS metadata page which must be
reclaimed last. The page version array (used to mitigate replaying old
reclaimed pages) is also architecturally reclaimable, but not yet
implemented. The end result is that the vast majority of enclave pages are
currently reclaimable.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-22-jarkko@kernel.org
2020-11-13 06:01:32 +08:00
|
|
|
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
|
2021-10-27 06:00:45 +08:00
|
|
|
static DEFINE_XARRAY(sgx_epc_address_space);
|
x86/sgx: Add a page reclaimer
Just like normal RAM, there is a limited amount of enclave memory available
and overcommitting it is a very valuable tool to reduce resource use.
Introduce a simple reclaim mechanism for enclave pages.
In contrast to normal page reclaim, the kernel cannot directly access
enclave memory. To get around this, the SGX architecture provides a set of
functions to help. Among other things, these functions copy enclave memory
to and from normal memory, encrypting it and protecting its integrity in
the process.
Implement a page reclaimer by using these functions. Picks victim pages in
LRU fashion from all the enclaves running in the system. A new kernel
thread (ksgxswapd) reclaims pages in the background based on watermarks,
similar to normal kswapd.
All enclave pages can be reclaimed, architecturally. But, there are some
limits to this, such as the special SECS metadata page which must be
reclaimed last. The page version array (used to mitigate replaying old
reclaimed pages) is also architecturally reclaimable, but not yet
implemented. The end result is that the vast majority of enclave pages are
currently reclaimable.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-22-jarkko@kernel.org
2020-11-13 06:01:32 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* These variables are part of the state of the reclaimer, and must be accessed
|
|
|
|
* with sgx_reclaimer_lock acquired.
|
|
|
|
*/
|
|
|
|
static LIST_HEAD(sgx_active_page_list);
|
|
|
|
static DEFINE_SPINLOCK(sgx_reclaimer_lock);
|
2020-11-13 06:01:16 +08:00
|
|
|
|
x86/sgx: Fix free page accounting
The SGX driver maintains a single global free page counter,
sgx_nr_free_pages, that reflects the number of free pages available
across all NUMA nodes. Correspondingly, a list of free pages is
associated with each NUMA node and sgx_nr_free_pages is updated
every time a page is added or removed from any of the free page
lists. The main usage of sgx_nr_free_pages is by the reclaimer
that runs when it (sgx_nr_free_pages) goes below a watermark
to ensure that there are always some free pages available to, for
example, support efficient page faults.
With sgx_nr_free_pages accessed and modified from a few places
it is essential to ensure that these accesses are done safely but
this is not the case. sgx_nr_free_pages is read without any
protection and updated with inconsistent protection by any one
of the spin locks associated with the individual NUMA nodes.
For example:
CPU_A CPU_B
----- -----
spin_lock(&nodeA->lock); spin_lock(&nodeB->lock);
... ...
sgx_nr_free_pages--; /* NOT SAFE */ sgx_nr_free_pages--;
spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock);
Since sgx_nr_free_pages may be protected by different spin locks
while being modified from different CPUs, the following scenario
is possible:
CPU_A CPU_B
----- -----
{sgx_nr_free_pages = 100}
spin_lock(&nodeA->lock); spin_lock(&nodeB->lock);
sgx_nr_free_pages--; sgx_nr_free_pages--;
/* LOAD sgx_nr_free_pages = 100 */ /* LOAD sgx_nr_free_pages = 100 */
/* sgx_nr_free_pages-- */ /* sgx_nr_free_pages-- */
/* STORE sgx_nr_free_pages = 99 */ /* STORE sgx_nr_free_pages = 99 */
spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock);
In the above scenario, sgx_nr_free_pages is decremented from two CPUs
but instead of sgx_nr_free_pages ending with a value that is two less
than it started with, it was only decremented by one while the number
of free pages were actually reduced by two. The consequence of
sgx_nr_free_pages not being protected is that its value may not
accurately reflect the actual number of free pages on the system,
impacting the availability of free pages in support of many flows.
The problematic scenario is when the reclaimer does not run because it
believes there to be sufficient free pages while any attempt to allocate
a page fails because there are no free pages available. In the SGX driver
the reclaimer's watermark is only 32 pages so after encountering the
above example scenario 32 times a user space hang is possible when there
are no more free pages because of repeated page faults caused by no
free pages made available.
The following flow was encountered:
asm_exc_page_fault
...
sgx_vma_fault()
sgx_encl_load_page()
sgx_encl_eldu() // Encrypted page needs to be loaded from backing
// storage into newly allocated SGX memory page
sgx_alloc_epc_page() // Allocate a page of SGX memory
__sgx_alloc_epc_page() // Fails, no free SGX memory
...
if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) // Wake reclaimer
wake_up(&ksgxd_waitq);
return -EBUSY; // Return -EBUSY giving reclaimer time to run
return -EBUSY;
return -EBUSY;
return VM_FAULT_NOPAGE;
The reclaimer is triggered in above flow with the following code:
static bool sgx_should_reclaim(unsigned long watermark)
{
return sgx_nr_free_pages < watermark &&
!list_empty(&sgx_active_page_list);
}
In the problematic scenario there were no free pages available yet the
value of sgx_nr_free_pages was above the watermark. The allocation of
SGX memory thus always failed because of a lack of free pages while no
free pages were made available because the reclaimer is never started
because of sgx_nr_free_pages' incorrect value. The consequence was that
user space kept encountering VM_FAULT_NOPAGE that caused the same
address to be accessed repeatedly with the same result.
Change the global free page counter to an atomic type that
ensures simultaneous updates are done safely. While doing so, move
the updating of the variable outside of the spin lock critical
section to which it does not belong.
Cc: stable@vger.kernel.org
Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()")
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/a95a40743bbd3f795b465f30922dde7f1ea9e0eb.1637004094.git.reinette.chatre@intel.com
2021-11-16 03:29:04 +08:00
|
|
|
static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
|
2021-03-18 07:53:31 +08:00
|
|
|
|
|
|
|
/* Nodes with one or more EPC sections. */
|
|
|
|
static nodemask_t sgx_numa_mask;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Array with one list_head for each possible NUMA node. Each
|
|
|
|
* list contains all the sgx_epc_section's which are on that
|
|
|
|
* node.
|
|
|
|
*/
|
|
|
|
static struct sgx_numa_node *sgx_numa_nodes;
|
|
|
|
|
2021-03-18 07:53:30 +08:00
|
|
|
static LIST_HEAD(sgx_dirty_page_list);
|
|
|
|
|
2020-11-13 06:01:16 +08:00
|
|
|
/*
|
2021-03-18 07:53:30 +08:00
|
|
|
* Reset post-kexec EPC pages to the uninitialized state. The pages are removed
|
|
|
|
* from the input list, and made available for the page allocator. SECS pages
|
|
|
|
* prepending their children in the input list are left intact.
|
2020-11-13 06:01:16 +08:00
|
|
|
*/
|
2021-03-18 07:53:30 +08:00
|
|
|
static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
|
2020-11-13 06:01:16 +08:00
|
|
|
{
|
|
|
|
struct sgx_epc_page *page;
|
|
|
|
LIST_HEAD(dirty);
|
|
|
|
int ret;
|
|
|
|
|
2021-03-18 07:53:30 +08:00
|
|
|
/* dirty_page_list is thread-local, no need for a lock: */
|
|
|
|
while (!list_empty(dirty_page_list)) {
|
2020-11-13 06:01:16 +08:00
|
|
|
if (kthread_should_stop())
|
|
|
|
return;
|
|
|
|
|
2021-03-18 07:53:30 +08:00
|
|
|
page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
|
2020-11-13 06:01:16 +08:00
|
|
|
|
2021-10-27 06:00:46 +08:00
|
|
|
/*
|
|
|
|
* Checking page->poison without holding the node->lock
|
|
|
|
* is racy, but losing the race (i.e. poison is set just
|
|
|
|
* after the check) just means __eremove() will be uselessly
|
|
|
|
* called for a page that sgx_free_epc_page() will put onto
|
|
|
|
* the node->sgx_poison_page_list later.
|
|
|
|
*/
|
|
|
|
if (page->poison) {
|
|
|
|
struct sgx_epc_section *section = &sgx_epc_sections[page->section];
|
|
|
|
struct sgx_numa_node *node = section->node;
|
|
|
|
|
|
|
|
spin_lock(&node->lock);
|
|
|
|
list_move(&page->list, &node->sgx_poison_page_list);
|
|
|
|
spin_unlock(&node->lock);
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-11-13 06:01:16 +08:00
|
|
|
ret = __eremove(sgx_get_epc_virt_addr(page));
|
2021-03-18 07:53:30 +08:00
|
|
|
if (!ret) {
|
|
|
|
/*
|
|
|
|
* page is now sanitized. Make it available via the SGX
|
|
|
|
* page allocator:
|
|
|
|
*/
|
|
|
|
list_del(&page->list);
|
|
|
|
sgx_free_epc_page(page);
|
|
|
|
} else {
|
|
|
|
/* The page is not yet clean - move to the dirty list. */
|
2020-11-13 06:01:16 +08:00
|
|
|
list_move_tail(&page->list, &dirty);
|
2021-03-18 07:53:30 +08:00
|
|
|
}
|
2020-11-13 06:01:16 +08:00
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
2021-03-18 07:53:30 +08:00
|
|
|
list_splice(&dirty, dirty_page_list);
|
2020-11-13 06:01:16 +08:00
|
|
|
}
|
|
|
|
|
x86/sgx: Add a page reclaimer
Just like normal RAM, there is a limited amount of enclave memory available
and overcommitting it is a very valuable tool to reduce resource use.
Introduce a simple reclaim mechanism for enclave pages.
In contrast to normal page reclaim, the kernel cannot directly access
enclave memory. To get around this, the SGX architecture provides a set of
functions to help. Among other things, these functions copy enclave memory
to and from normal memory, encrypting it and protecting its integrity in
the process.
Implement a page reclaimer by using these functions. Picks victim pages in
LRU fashion from all the enclaves running in the system. A new kernel
thread (ksgxswapd) reclaims pages in the background based on watermarks,
similar to normal kswapd.
All enclave pages can be reclaimed, architecturally. But, there are some
limits to this, such as the special SECS metadata page which must be
reclaimed last. The page version array (used to mitigate replaying old
reclaimed pages) is also architecturally reclaimable, but not yet
implemented. The end result is that the vast majority of enclave pages are
currently reclaimable.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-22-jarkko@kernel.org
2020-11-13 06:01:32 +08:00
|
|
|
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
|
|
|
|
{
|
|
|
|
struct sgx_encl_page *page = epc_page->owner;
|
|
|
|
struct sgx_encl *encl = page->encl;
|
|
|
|
struct sgx_encl_mm *encl_mm;
|
|
|
|
bool ret = true;
|
|
|
|
int idx;
|
|
|
|
|
|
|
|
idx = srcu_read_lock(&encl->srcu);
|
|
|
|
|
|
|
|
list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
|
|
|
|
if (!mmget_not_zero(encl_mm->mm))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
mmap_read_lock(encl_mm->mm);
|
|
|
|
ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
|
|
|
|
mmap_read_unlock(encl_mm->mm);
|
|
|
|
|
|
|
|
mmput_async(encl_mm->mm);
|
|
|
|
|
|
|
|
if (!ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
srcu_read_unlock(&encl->srcu, idx);
|
|
|
|
|
|
|
|
if (!ret)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
|
|
|
|
{
|
|
|
|
struct sgx_encl_page *page = epc_page->owner;
|
|
|
|
unsigned long addr = page->desc & PAGE_MASK;
|
|
|
|
struct sgx_encl *encl = page->encl;
|
|
|
|
unsigned long mm_list_version;
|
|
|
|
struct sgx_encl_mm *encl_mm;
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
int idx, ret;
|
|
|
|
|
|
|
|
do {
|
|
|
|
mm_list_version = encl->mm_list_version;
|
|
|
|
|
|
|
|
/* Pairs with smp_rmb() in sgx_encl_mm_add(). */
|
|
|
|
smp_rmb();
|
|
|
|
|
|
|
|
idx = srcu_read_lock(&encl->srcu);
|
|
|
|
|
|
|
|
list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
|
|
|
|
if (!mmget_not_zero(encl_mm->mm))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
mmap_read_lock(encl_mm->mm);
|
|
|
|
|
|
|
|
ret = sgx_encl_find(encl_mm->mm, addr, &vma);
|
|
|
|
if (!ret && encl == vma->vm_private_data)
|
|
|
|
zap_vma_ptes(vma, addr, PAGE_SIZE);
|
|
|
|
|
|
|
|
mmap_read_unlock(encl_mm->mm);
|
|
|
|
|
|
|
|
mmput_async(encl_mm->mm);
|
|
|
|
}
|
|
|
|
|
|
|
|
srcu_read_unlock(&encl->srcu, idx);
|
|
|
|
} while (unlikely(encl->mm_list_version != mm_list_version));
|
|
|
|
|
|
|
|
mutex_lock(&encl->lock);
|
|
|
|
|
|
|
|
ret = __eblock(sgx_get_epc_virt_addr(epc_page));
|
|
|
|
if (encls_failed(ret))
|
|
|
|
ENCLS_WARN(ret, "EBLOCK");
|
|
|
|
|
|
|
|
mutex_unlock(&encl->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
|
|
|
|
struct sgx_backing *backing)
|
|
|
|
{
|
|
|
|
struct sgx_pageinfo pginfo;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
pginfo.addr = 0;
|
|
|
|
pginfo.secs = 0;
|
|
|
|
|
|
|
|
pginfo.contents = (unsigned long)kmap_atomic(backing->contents);
|
|
|
|
pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) +
|
|
|
|
backing->pcmd_offset;
|
|
|
|
|
|
|
|
ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
|
|
|
|
|
|
|
|
kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
|
|
|
|
backing->pcmd_offset));
|
|
|
|
kunmap_atomic((void *)(unsigned long)pginfo.contents);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void sgx_ipi_cb(void *info)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl)
|
|
|
|
{
|
|
|
|
cpumask_t *cpumask = &encl->cpumask;
|
|
|
|
struct sgx_encl_mm *encl_mm;
|
|
|
|
int idx;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Can race with sgx_encl_mm_add(), but ETRACK has already been
|
|
|
|
* executed, which means that the CPUs running in the new mm will enter
|
|
|
|
* into the enclave with a fresh epoch.
|
|
|
|
*/
|
|
|
|
cpumask_clear(cpumask);
|
|
|
|
|
|
|
|
idx = srcu_read_lock(&encl->srcu);
|
|
|
|
|
|
|
|
list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
|
|
|
|
if (!mmget_not_zero(encl_mm->mm))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
|
|
|
|
|
|
|
|
mmput_async(encl_mm->mm);
|
|
|
|
}
|
|
|
|
|
|
|
|
srcu_read_unlock(&encl->srcu, idx);
|
|
|
|
|
|
|
|
return cpumask;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Swap page to the regular memory transformed to the blocked state by using
|
2021-03-18 22:28:01 +08:00
|
|
|
* EBLOCK, which means that it can no longer be referenced (no new TLB entries).
|
x86/sgx: Add a page reclaimer
Just like normal RAM, there is a limited amount of enclave memory available
and overcommitting it is a very valuable tool to reduce resource use.
Introduce a simple reclaim mechanism for enclave pages.
In contrast to normal page reclaim, the kernel cannot directly access
enclave memory. To get around this, the SGX architecture provides a set of
functions to help. Among other things, these functions copy enclave memory
to and from normal memory, encrypting it and protecting its integrity in
the process.
Implement a page reclaimer by using these functions. Picks victim pages in
LRU fashion from all the enclaves running in the system. A new kernel
thread (ksgxswapd) reclaims pages in the background based on watermarks,
similar to normal kswapd.
All enclave pages can be reclaimed, architecturally. But, there are some
limits to this, such as the special SECS metadata page which must be
reclaimed last. The page version array (used to mitigate replaying old
reclaimed pages) is also architecturally reclaimable, but not yet
implemented. The end result is that the vast majority of enclave pages are
currently reclaimable.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-22-jarkko@kernel.org
2020-11-13 06:01:32 +08:00
|
|
|
*
|
|
|
|
* The first trial just tries to write the page assuming that some other thread
|
2021-03-18 22:28:01 +08:00
|
|
|
* has reset the count for threads inside the enclave by using ETRACK, and
|
x86/sgx: Add a page reclaimer
Just like normal RAM, there is a limited amount of enclave memory available
and overcommitting it is a very valuable tool to reduce resource use.
Introduce a simple reclaim mechanism for enclave pages.
In contrast to normal page reclaim, the kernel cannot directly access
enclave memory. To get around this, the SGX architecture provides a set of
functions to help. Among other things, these functions copy enclave memory
to and from normal memory, encrypting it and protecting its integrity in
the process.
Implement a page reclaimer by using these functions. Picks victim pages in
LRU fashion from all the enclaves running in the system. A new kernel
thread (ksgxswapd) reclaims pages in the background based on watermarks,
similar to normal kswapd.
All enclave pages can be reclaimed, architecturally. But, there are some
limits to this, such as the special SECS metadata page which must be
reclaimed last. The page version array (used to mitigate replaying old
reclaimed pages) is also architecturally reclaimable, but not yet
implemented. The end result is that the vast majority of enclave pages are
currently reclaimable.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-22-jarkko@kernel.org
2020-11-13 06:01:32 +08:00
|
|
|
* previous thread count has been zeroed out. The second trial calls ETRACK
|
|
|
|
* before EWB. If that fails we kick all the HW threads out, and then do EWB,
|
|
|
|
* which should be guaranteed the succeed.
|
|
|
|
*/
|
|
|
|
static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
|
|
|
|
struct sgx_backing *backing)
|
|
|
|
{
|
|
|
|
struct sgx_encl_page *encl_page = epc_page->owner;
|
|
|
|
struct sgx_encl *encl = encl_page->encl;
|
|
|
|
struct sgx_va_page *va_page;
|
|
|
|
unsigned int va_offset;
|
|
|
|
void *va_slot;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;
|
|
|
|
|
|
|
|
va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
|
|
|
|
list);
|
|
|
|
va_offset = sgx_alloc_va_slot(va_page);
|
|
|
|
va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
|
|
|
|
if (sgx_va_page_full(va_page))
|
|
|
|
list_move_tail(&va_page->list, &encl->va_pages);
|
|
|
|
|
|
|
|
ret = __sgx_encl_ewb(epc_page, va_slot, backing);
|
|
|
|
if (ret == SGX_NOT_TRACKED) {
|
|
|
|
ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
|
|
|
|
if (ret) {
|
|
|
|
if (encls_failed(ret))
|
|
|
|
ENCLS_WARN(ret, "ETRACK");
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = __sgx_encl_ewb(epc_page, va_slot, backing);
|
|
|
|
if (ret == SGX_NOT_TRACKED) {
|
|
|
|
/*
|
|
|
|
* Slow path, send IPIs to kick cpus out of the
|
|
|
|
* enclave. Note, it's imperative that the cpu
|
|
|
|
* mask is generated *after* ETRACK, else we'll
|
|
|
|
* miss cpus that entered the enclave between
|
|
|
|
* generating the mask and incrementing epoch.
|
|
|
|
*/
|
|
|
|
on_each_cpu_mask(sgx_encl_ewb_cpumask(encl),
|
|
|
|
sgx_ipi_cb, NULL, 1);
|
|
|
|
ret = __sgx_encl_ewb(epc_page, va_slot, backing);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
if (encls_failed(ret))
|
|
|
|
ENCLS_WARN(ret, "EWB");
|
|
|
|
|
|
|
|
sgx_free_va_slot(va_page, va_offset);
|
|
|
|
} else {
|
|
|
|
encl_page->desc |= va_offset;
|
|
|
|
encl_page->va_page = va_page;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
|
|
|
|
struct sgx_backing *backing)
|
|
|
|
{
|
|
|
|
struct sgx_encl_page *encl_page = epc_page->owner;
|
|
|
|
struct sgx_encl *encl = encl_page->encl;
|
|
|
|
struct sgx_backing secs_backing;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
mutex_lock(&encl->lock);
|
|
|
|
|
|
|
|
sgx_encl_ewb(epc_page, backing);
|
|
|
|
encl_page->epc_page = NULL;
|
|
|
|
encl->secs_child_cnt--;
|
|
|
|
|
|
|
|
if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
|
|
|
|
ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size),
|
|
|
|
&secs_backing);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
|
|
|
|
|
x86/sgx: Wipe out EREMOVE from sgx_free_epc_page()
EREMOVE takes a page and removes any association between that page and
an enclave. It must be run on a page before it can be added into another
enclave. Currently, EREMOVE is run as part of pages being freed into the
SGX page allocator. It is not expected to fail, as it would indicate a
use-after-free of EPC pages. Rather than add the page back to the pool
of available EPC pages, the kernel intentionally leaks the page to avoid
additional errors in the future.
However, KVM does not track how guest pages are used, which means that
SGX virtualization use of EREMOVE might fail. Specifically, it is
legitimate that EREMOVE returns SGX_CHILD_PRESENT for EPC assigned to
KVM guest, because KVM/kernel doesn't track SECS pages.
To allow SGX/KVM to introduce a more permissive EREMOVE helper and
to let the SGX virtualization code use the allocator directly, break
out the EREMOVE call from the SGX page allocator. Rename the original
sgx_free_epc_page() to sgx_encl_free_epc_page(), indicating that
it is used to free an EPC page assigned to a host enclave. Replace
sgx_free_epc_page() with sgx_encl_free_epc_page() in all call sites so
there's no functional change.
At the same time, improve the error message when EREMOVE fails, and
add documentation to explain to the user what that failure means and
to suggest to the user what to do when this bug happens in the case it
happens.
[ bp: Massage commit message, fix typos and sanitize text, simplify. ]
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/20210325093057.122834-1-kai.huang@intel.com
2021-03-25 17:30:57 +08:00
|
|
|
sgx_encl_free_epc_page(encl->secs.epc_page);
|
x86/sgx: Add a page reclaimer
Just like normal RAM, there is a limited amount of enclave memory available
and overcommitting it is a very valuable tool to reduce resource use.
Introduce a simple reclaim mechanism for enclave pages.
In contrast to normal page reclaim, the kernel cannot directly access
enclave memory. To get around this, the SGX architecture provides a set of
functions to help. Among other things, these functions copy enclave memory
to and from normal memory, encrypting it and protecting its integrity in
the process.
Implement a page reclaimer by using these functions. Picks victim pages in
LRU fashion from all the enclaves running in the system. A new kernel
thread (ksgxswapd) reclaims pages in the background based on watermarks,
similar to normal kswapd.
All enclave pages can be reclaimed, architecturally. But, there are some
limits to this, such as the special SECS metadata page which must be
reclaimed last. The page version array (used to mitigate replaying old
reclaimed pages) is also architecturally reclaimable, but not yet
implemented. The end result is that the vast majority of enclave pages are
currently reclaimable.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-22-jarkko@kernel.org
2020-11-13 06:01:32 +08:00
|
|
|
encl->secs.epc_page = NULL;
|
|
|
|
|
|
|
|
sgx_encl_put_backing(&secs_backing, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
mutex_unlock(&encl->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Take a fixed number of pages from the head of the active page pool and
|
|
|
|
* reclaim them to the enclave's private shmem files. Skip the pages, which have
|
|
|
|
* been accessed since the last scan. Move those pages to the tail of active
|
|
|
|
* page pool so that the pages get scanned in LRU like fashion.
|
|
|
|
*
|
|
|
|
* Batch process a chunk of pages (at the moment 16) in order to degrade amount
|
|
|
|
* of IPI's and ETRACK's potentially required. sgx_encl_ewb() does degrade a bit
|
|
|
|
* among the HW threads with three stage EWB pipeline (EWB, ETRACK + EWB and IPI
|
|
|
|
* + EWB) but not sufficiently. Reclaiming one page at a time would also be
|
|
|
|
* problematic as it would increase the lock contention too much, which would
|
|
|
|
* halt forward progress.
|
|
|
|
*/
|
|
|
|
static void sgx_reclaim_pages(void)
|
|
|
|
{
|
|
|
|
struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
|
|
|
|
struct sgx_backing backing[SGX_NR_TO_SCAN];
|
|
|
|
struct sgx_encl_page *encl_page;
|
|
|
|
struct sgx_epc_page *epc_page;
|
|
|
|
pgoff_t page_index;
|
|
|
|
int cnt = 0;
|
|
|
|
int ret;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
spin_lock(&sgx_reclaimer_lock);
|
|
|
|
for (i = 0; i < SGX_NR_TO_SCAN; i++) {
|
|
|
|
if (list_empty(&sgx_active_page_list))
|
|
|
|
break;
|
|
|
|
|
|
|
|
epc_page = list_first_entry(&sgx_active_page_list,
|
|
|
|
struct sgx_epc_page, list);
|
|
|
|
list_del_init(&epc_page->list);
|
|
|
|
encl_page = epc_page->owner;
|
|
|
|
|
|
|
|
if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
|
|
|
|
chunk[cnt++] = epc_page;
|
|
|
|
else
|
|
|
|
/* The owner is freeing the page. No need to add the
|
|
|
|
* page back to the list of reclaimable pages.
|
|
|
|
*/
|
|
|
|
epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
|
|
|
|
}
|
|
|
|
spin_unlock(&sgx_reclaimer_lock);
|
|
|
|
|
|
|
|
for (i = 0; i < cnt; i++) {
|
|
|
|
epc_page = chunk[i];
|
|
|
|
encl_page = epc_page->owner;
|
|
|
|
|
|
|
|
if (!sgx_reclaimer_age(epc_page))
|
|
|
|
goto skip;
|
|
|
|
|
|
|
|
page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
|
|
|
|
ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]);
|
|
|
|
if (ret)
|
|
|
|
goto skip;
|
|
|
|
|
|
|
|
mutex_lock(&encl_page->encl->lock);
|
|
|
|
encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
|
|
|
|
mutex_unlock(&encl_page->encl->lock);
|
|
|
|
continue;
|
|
|
|
|
|
|
|
skip:
|
|
|
|
spin_lock(&sgx_reclaimer_lock);
|
|
|
|
list_add_tail(&epc_page->list, &sgx_active_page_list);
|
|
|
|
spin_unlock(&sgx_reclaimer_lock);
|
|
|
|
|
|
|
|
kref_put(&encl_page->encl->refcount, sgx_encl_release);
|
|
|
|
|
|
|
|
chunk[i] = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < cnt; i++) {
|
|
|
|
epc_page = chunk[i];
|
|
|
|
if (epc_page)
|
|
|
|
sgx_reclaimer_block(epc_page);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < cnt; i++) {
|
|
|
|
epc_page = chunk[i];
|
|
|
|
if (!epc_page)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
encl_page = epc_page->owner;
|
|
|
|
sgx_reclaimer_write(epc_page, &backing[i]);
|
|
|
|
sgx_encl_put_backing(&backing[i], true);
|
|
|
|
|
|
|
|
kref_put(&encl_page->encl->refcount, sgx_encl_release);
|
|
|
|
epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
|
|
|
|
|
2022-02-03 03:41:12 +08:00
|
|
|
sgx_free_epc_page(epc_page);
|
2021-03-18 07:53:31 +08:00
|
|
|
}
|
x86/sgx: Add a page reclaimer
Just like normal RAM, there is a limited amount of enclave memory available
and overcommitting it is a very valuable tool to reduce resource use.
Introduce a simple reclaim mechanism for enclave pages.
In contrast to normal page reclaim, the kernel cannot directly access
enclave memory. To get around this, the SGX architecture provides a set of
functions to help. Among other things, these functions copy enclave memory
to and from normal memory, encrypting it and protecting its integrity in
the process.
Implement a page reclaimer by using these functions. Picks victim pages in
LRU fashion from all the enclaves running in the system. A new kernel
thread (ksgxswapd) reclaims pages in the background based on watermarks,
similar to normal kswapd.
All enclave pages can be reclaimed, architecturally. But, there are some
limits to this, such as the special SECS metadata page which must be
reclaimed last. The page version array (used to mitigate replaying old
reclaimed pages) is also architecturally reclaimable, but not yet
implemented. The end result is that the vast majority of enclave pages are
currently reclaimable.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-22-jarkko@kernel.org
2020-11-13 06:01:32 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool sgx_should_reclaim(unsigned long watermark)
|
|
|
|
{
|
x86/sgx: Fix free page accounting
The SGX driver maintains a single global free page counter,
sgx_nr_free_pages, that reflects the number of free pages available
across all NUMA nodes. Correspondingly, a list of free pages is
associated with each NUMA node and sgx_nr_free_pages is updated
every time a page is added or removed from any of the free page
lists. The main usage of sgx_nr_free_pages is by the reclaimer
that runs when it (sgx_nr_free_pages) goes below a watermark
to ensure that there are always some free pages available to, for
example, support efficient page faults.
With sgx_nr_free_pages accessed and modified from a few places
it is essential to ensure that these accesses are done safely but
this is not the case. sgx_nr_free_pages is read without any
protection and updated with inconsistent protection by any one
of the spin locks associated with the individual NUMA nodes.
For example:
CPU_A CPU_B
----- -----
spin_lock(&nodeA->lock); spin_lock(&nodeB->lock);
... ...
sgx_nr_free_pages--; /* NOT SAFE */ sgx_nr_free_pages--;
spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock);
Since sgx_nr_free_pages may be protected by different spin locks
while being modified from different CPUs, the following scenario
is possible:
CPU_A CPU_B
----- -----
{sgx_nr_free_pages = 100}
spin_lock(&nodeA->lock); spin_lock(&nodeB->lock);
sgx_nr_free_pages--; sgx_nr_free_pages--;
/* LOAD sgx_nr_free_pages = 100 */ /* LOAD sgx_nr_free_pages = 100 */
/* sgx_nr_free_pages-- */ /* sgx_nr_free_pages-- */
/* STORE sgx_nr_free_pages = 99 */ /* STORE sgx_nr_free_pages = 99 */
spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock);
In the above scenario, sgx_nr_free_pages is decremented from two CPUs
but instead of sgx_nr_free_pages ending with a value that is two less
than it started with, it was only decremented by one while the number
of free pages were actually reduced by two. The consequence of
sgx_nr_free_pages not being protected is that its value may not
accurately reflect the actual number of free pages on the system,
impacting the availability of free pages in support of many flows.
The problematic scenario is when the reclaimer does not run because it
believes there to be sufficient free pages while any attempt to allocate
a page fails because there are no free pages available. In the SGX driver
the reclaimer's watermark is only 32 pages so after encountering the
above example scenario 32 times a user space hang is possible when there
are no more free pages because of repeated page faults caused by no
free pages made available.
The following flow was encountered:
asm_exc_page_fault
...
sgx_vma_fault()
sgx_encl_load_page()
sgx_encl_eldu() // Encrypted page needs to be loaded from backing
// storage into newly allocated SGX memory page
sgx_alloc_epc_page() // Allocate a page of SGX memory
__sgx_alloc_epc_page() // Fails, no free SGX memory
...
if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) // Wake reclaimer
wake_up(&ksgxd_waitq);
return -EBUSY; // Return -EBUSY giving reclaimer time to run
return -EBUSY;
return -EBUSY;
return VM_FAULT_NOPAGE;
The reclaimer is triggered in above flow with the following code:
static bool sgx_should_reclaim(unsigned long watermark)
{
return sgx_nr_free_pages < watermark &&
!list_empty(&sgx_active_page_list);
}
In the problematic scenario there were no free pages available yet the
value of sgx_nr_free_pages was above the watermark. The allocation of
SGX memory thus always failed because of a lack of free pages while no
free pages were made available because the reclaimer is never started
because of sgx_nr_free_pages' incorrect value. The consequence was that
user space kept encountering VM_FAULT_NOPAGE that caused the same
address to be accessed repeatedly with the same result.
Change the global free page counter to an atomic type that
ensures simultaneous updates are done safely. While doing so, move
the updating of the variable outside of the spin lock critical
section to which it does not belong.
Cc: stable@vger.kernel.org
Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()")
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/a95a40743bbd3f795b465f30922dde7f1ea9e0eb.1637004094.git.reinette.chatre@intel.com
2021-11-16 03:29:04 +08:00
|
|
|
return atomic_long_read(&sgx_nr_free_pages) < watermark &&
|
|
|
|
!list_empty(&sgx_active_page_list);
|
x86/sgx: Add a page reclaimer
Just like normal RAM, there is a limited amount of enclave memory available
and overcommitting it is a very valuable tool to reduce resource use.
Introduce a simple reclaim mechanism for enclave pages.
In contrast to normal page reclaim, the kernel cannot directly access
enclave memory. To get around this, the SGX architecture provides a set of
functions to help. Among other things, these functions copy enclave memory
to and from normal memory, encrypting it and protecting its integrity in
the process.
Implement a page reclaimer by using these functions. Picks victim pages in
LRU fashion from all the enclaves running in the system. A new kernel
thread (ksgxswapd) reclaims pages in the background based on watermarks,
similar to normal kswapd.
All enclave pages can be reclaimed, architecturally. But, there are some
limits to this, such as the special SECS metadata page which must be
reclaimed last. The page version array (used to mitigate replaying old
reclaimed pages) is also architecturally reclaimable, but not yet
implemented. The end result is that the vast majority of enclave pages are
currently reclaimable.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-22-jarkko@kernel.org
2020-11-13 06:01:32 +08:00
|
|
|
}
|
|
|
|
|
2020-11-13 06:01:16 +08:00
|
|
|
static int ksgxd(void *p)
|
|
|
|
{
|
|
|
|
set_freezable();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sanitize pages in order to recover from kexec(). The 2nd pass is
|
|
|
|
* required for SECS pages, whose child pages blocked EREMOVE.
|
|
|
|
*/
|
2021-03-18 07:53:30 +08:00
|
|
|
__sgx_sanitize_pages(&sgx_dirty_page_list);
|
|
|
|
__sgx_sanitize_pages(&sgx_dirty_page_list);
|
2020-11-13 06:01:16 +08:00
|
|
|
|
2021-03-18 07:53:30 +08:00
|
|
|
/* sanity check: */
|
|
|
|
WARN_ON(!list_empty(&sgx_dirty_page_list));
|
2020-11-13 06:01:16 +08:00
|
|
|
|
x86/sgx: Add a page reclaimer
Just like normal RAM, there is a limited amount of enclave memory available
and overcommitting it is a very valuable tool to reduce resource use.
Introduce a simple reclaim mechanism for enclave pages.
In contrast to normal page reclaim, the kernel cannot directly access
enclave memory. To get around this, the SGX architecture provides a set of
functions to help. Among other things, these functions copy enclave memory
to and from normal memory, encrypting it and protecting its integrity in
the process.
Implement a page reclaimer by using these functions. Picks victim pages in
LRU fashion from all the enclaves running in the system. A new kernel
thread (ksgxswapd) reclaims pages in the background based on watermarks,
similar to normal kswapd.
All enclave pages can be reclaimed, architecturally. But, there are some
limits to this, such as the special SECS metadata page which must be
reclaimed last. The page version array (used to mitigate replaying old
reclaimed pages) is also architecturally reclaimable, but not yet
implemented. The end result is that the vast majority of enclave pages are
currently reclaimable.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-22-jarkko@kernel.org
2020-11-13 06:01:32 +08:00
|
|
|
while (!kthread_should_stop()) {
|
|
|
|
if (try_to_freeze())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
wait_event_freezable(ksgxd_waitq,
|
|
|
|
kthread_should_stop() ||
|
|
|
|
sgx_should_reclaim(SGX_NR_HIGH_PAGES));
|
|
|
|
|
|
|
|
if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
|
|
|
|
sgx_reclaim_pages();
|
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
2020-11-13 06:01:16 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool __init sgx_page_reclaimer_init(void)
|
|
|
|
{
|
|
|
|
struct task_struct *tsk;
|
|
|
|
|
|
|
|
tsk = kthread_run(ksgxd, NULL, "ksgxd");
|
|
|
|
if (IS_ERR(tsk))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
ksgxd_tsk = tsk;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2021-03-18 07:53:31 +08:00
|
|
|
static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
|
2020-11-13 06:01:20 +08:00
|
|
|
{
|
2021-03-18 07:53:31 +08:00
|
|
|
struct sgx_numa_node *node = &sgx_numa_nodes[nid];
|
|
|
|
struct sgx_epc_page *page = NULL;
|
2020-11-13 06:01:20 +08:00
|
|
|
|
2021-03-18 07:53:31 +08:00
|
|
|
spin_lock(&node->lock);
|
2020-11-13 06:01:20 +08:00
|
|
|
|
2021-03-18 07:53:31 +08:00
|
|
|
if (list_empty(&node->free_page_list)) {
|
|
|
|
spin_unlock(&node->lock);
|
2020-11-13 06:01:20 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2021-03-18 07:53:31 +08:00
|
|
|
page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
|
2020-11-13 06:01:20 +08:00
|
|
|
list_del_init(&page->list);
|
2021-10-27 06:00:44 +08:00
|
|
|
page->flags = 0;
|
2021-03-18 07:53:31 +08:00
|
|
|
|
|
|
|
spin_unlock(&node->lock);
|
x86/sgx: Fix free page accounting
The SGX driver maintains a single global free page counter,
sgx_nr_free_pages, that reflects the number of free pages available
across all NUMA nodes. Correspondingly, a list of free pages is
associated with each NUMA node and sgx_nr_free_pages is updated
every time a page is added or removed from any of the free page
lists. The main usage of sgx_nr_free_pages is by the reclaimer
that runs when it (sgx_nr_free_pages) goes below a watermark
to ensure that there are always some free pages available to, for
example, support efficient page faults.
With sgx_nr_free_pages accessed and modified from a few places
it is essential to ensure that these accesses are done safely but
this is not the case. sgx_nr_free_pages is read without any
protection and updated with inconsistent protection by any one
of the spin locks associated with the individual NUMA nodes.
For example:
CPU_A CPU_B
----- -----
spin_lock(&nodeA->lock); spin_lock(&nodeB->lock);
... ...
sgx_nr_free_pages--; /* NOT SAFE */ sgx_nr_free_pages--;
spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock);
Since sgx_nr_free_pages may be protected by different spin locks
while being modified from different CPUs, the following scenario
is possible:
CPU_A CPU_B
----- -----
{sgx_nr_free_pages = 100}
spin_lock(&nodeA->lock); spin_lock(&nodeB->lock);
sgx_nr_free_pages--; sgx_nr_free_pages--;
/* LOAD sgx_nr_free_pages = 100 */ /* LOAD sgx_nr_free_pages = 100 */
/* sgx_nr_free_pages-- */ /* sgx_nr_free_pages-- */
/* STORE sgx_nr_free_pages = 99 */ /* STORE sgx_nr_free_pages = 99 */
spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock);
In the above scenario, sgx_nr_free_pages is decremented from two CPUs
but instead of sgx_nr_free_pages ending with a value that is two less
than it started with, it was only decremented by one while the number
of free pages were actually reduced by two. The consequence of
sgx_nr_free_pages not being protected is that its value may not
accurately reflect the actual number of free pages on the system,
impacting the availability of free pages in support of many flows.
The problematic scenario is when the reclaimer does not run because it
believes there to be sufficient free pages while any attempt to allocate
a page fails because there are no free pages available. In the SGX driver
the reclaimer's watermark is only 32 pages so after encountering the
above example scenario 32 times a user space hang is possible when there
are no more free pages because of repeated page faults caused by no
free pages made available.
The following flow was encountered:
asm_exc_page_fault
...
sgx_vma_fault()
sgx_encl_load_page()
sgx_encl_eldu() // Encrypted page needs to be loaded from backing
// storage into newly allocated SGX memory page
sgx_alloc_epc_page() // Allocate a page of SGX memory
__sgx_alloc_epc_page() // Fails, no free SGX memory
...
if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) // Wake reclaimer
wake_up(&ksgxd_waitq);
return -EBUSY; // Return -EBUSY giving reclaimer time to run
return -EBUSY;
return -EBUSY;
return VM_FAULT_NOPAGE;
The reclaimer is triggered in above flow with the following code:
static bool sgx_should_reclaim(unsigned long watermark)
{
return sgx_nr_free_pages < watermark &&
!list_empty(&sgx_active_page_list);
}
In the problematic scenario there were no free pages available yet the
value of sgx_nr_free_pages was above the watermark. The allocation of
SGX memory thus always failed because of a lack of free pages while no
free pages were made available because the reclaimer is never started
because of sgx_nr_free_pages' incorrect value. The consequence was that
user space kept encountering VM_FAULT_NOPAGE that caused the same
address to be accessed repeatedly with the same result.
Change the global free page counter to an atomic type that
ensures simultaneous updates are done safely. While doing so, move
the updating of the variable outside of the spin lock critical
section to which it does not belong.
Cc: stable@vger.kernel.org
Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()")
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/a95a40743bbd3f795b465f30922dde7f1ea9e0eb.1637004094.git.reinette.chatre@intel.com
2021-11-16 03:29:04 +08:00
|
|
|
atomic_long_dec(&sgx_nr_free_pages);
|
2020-11-13 06:01:20 +08:00
|
|
|
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* __sgx_alloc_epc_page() - Allocate an EPC page
|
|
|
|
*
|
2021-03-18 07:53:31 +08:00
|
|
|
* Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start
|
|
|
|
* from the NUMA node, where the caller is executing.
|
2020-11-13 06:01:20 +08:00
|
|
|
*
|
|
|
|
* Return:
|
2021-03-18 07:53:31 +08:00
|
|
|
* - an EPC page: A borrowed EPC pages were available.
|
|
|
|
* - NULL: Out of EPC pages.
|
2020-11-13 06:01:20 +08:00
|
|
|
*/
|
|
|
|
struct sgx_epc_page *__sgx_alloc_epc_page(void)
|
|
|
|
{
|
|
|
|
struct sgx_epc_page *page;
|
2021-03-18 07:53:31 +08:00
|
|
|
int nid_of_current = numa_node_id();
|
|
|
|
int nid = nid_of_current;
|
2020-11-13 06:01:20 +08:00
|
|
|
|
2021-03-18 07:53:31 +08:00
|
|
|
if (node_isset(nid_of_current, sgx_numa_mask)) {
|
|
|
|
page = __sgx_alloc_epc_page_from_node(nid_of_current);
|
|
|
|
if (page)
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Fall back to the non-local NUMA nodes: */
|
|
|
|
while (true) {
|
|
|
|
nid = next_node_in(nid, sgx_numa_mask);
|
|
|
|
if (nid == nid_of_current)
|
|
|
|
break;
|
2020-11-13 06:01:20 +08:00
|
|
|
|
2021-03-18 07:53:31 +08:00
|
|
|
page = __sgx_alloc_epc_page_from_node(nid);
|
2020-11-13 06:01:20 +08:00
|
|
|
if (page)
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
}
|
|
|
|
|
x86/sgx: Add a page reclaimer
Just like normal RAM, there is a limited amount of enclave memory available
and overcommitting it is a very valuable tool to reduce resource use.
Introduce a simple reclaim mechanism for enclave pages.
In contrast to normal page reclaim, the kernel cannot directly access
enclave memory. To get around this, the SGX architecture provides a set of
functions to help. Among other things, these functions copy enclave memory
to and from normal memory, encrypting it and protecting its integrity in
the process.
Implement a page reclaimer by using these functions. Picks victim pages in
LRU fashion from all the enclaves running in the system. A new kernel
thread (ksgxswapd) reclaims pages in the background based on watermarks,
similar to normal kswapd.
All enclave pages can be reclaimed, architecturally. But, there are some
limits to this, such as the special SECS metadata page which must be
reclaimed last. The page version array (used to mitigate replaying old
reclaimed pages) is also architecturally reclaimable, but not yet
implemented. The end result is that the vast majority of enclave pages are
currently reclaimable.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-22-jarkko@kernel.org
2020-11-13 06:01:32 +08:00
|
|
|
/**
|
|
|
|
* sgx_mark_page_reclaimable() - Mark a page as reclaimable
|
|
|
|
* @page: EPC page
|
|
|
|
*
|
|
|
|
* Mark a page as reclaimable and add it to the active page list. Pages
|
|
|
|
* are automatically removed from the active list when freed.
|
|
|
|
*/
|
|
|
|
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
|
|
|
|
{
|
|
|
|
spin_lock(&sgx_reclaimer_lock);
|
|
|
|
page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
|
|
|
|
list_add_tail(&page->list, &sgx_active_page_list);
|
|
|
|
spin_unlock(&sgx_reclaimer_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
|
|
|
|
* @page: EPC page
|
|
|
|
*
|
|
|
|
* Clear the reclaimable flag and remove the page from the active page list.
|
|
|
|
*
|
|
|
|
* Return:
|
|
|
|
* 0 on success,
|
|
|
|
* -EBUSY if the page is in the process of being reclaimed
|
|
|
|
*/
|
|
|
|
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
|
|
|
|
{
|
|
|
|
spin_lock(&sgx_reclaimer_lock);
|
|
|
|
if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
|
|
|
|
/* The page is being reclaimed. */
|
|
|
|
if (list_empty(&page->list)) {
|
|
|
|
spin_unlock(&sgx_reclaimer_lock);
|
|
|
|
return -EBUSY;
|
|
|
|
}
|
|
|
|
|
|
|
|
list_del(&page->list);
|
|
|
|
page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
|
|
|
|
}
|
|
|
|
spin_unlock(&sgx_reclaimer_lock);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sgx_alloc_epc_page() - Allocate an EPC page
|
|
|
|
* @owner: the owner of the EPC page
|
|
|
|
* @reclaim: reclaim pages if necessary
|
|
|
|
*
|
|
|
|
* Iterate through EPC sections and borrow a free EPC page to the caller. When a
|
|
|
|
* page is no longer needed it must be released with sgx_free_epc_page(). If
|
|
|
|
* @reclaim is set to true, directly reclaim pages when we are out of pages. No
|
|
|
|
* mm's can be locked when @reclaim is set to true.
|
|
|
|
*
|
|
|
|
* Finally, wake up ksgxd when the number of pages goes below the watermark
|
|
|
|
* before returning back to the caller.
|
|
|
|
*
|
|
|
|
* Return:
|
|
|
|
* an EPC page,
|
|
|
|
* -errno on error
|
|
|
|
*/
|
|
|
|
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
|
|
|
|
{
|
|
|
|
struct sgx_epc_page *page;
|
|
|
|
|
|
|
|
for ( ; ; ) {
|
|
|
|
page = __sgx_alloc_epc_page();
|
|
|
|
if (!IS_ERR(page)) {
|
|
|
|
page->owner = owner;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (list_empty(&sgx_active_page_list))
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
if (!reclaim) {
|
|
|
|
page = ERR_PTR(-EBUSY);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (signal_pending(current)) {
|
|
|
|
page = ERR_PTR(-ERESTARTSYS);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
sgx_reclaim_pages();
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
|
|
|
|
wake_up(&ksgxd_waitq);
|
|
|
|
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
2020-11-13 06:01:20 +08:00
|
|
|
/**
|
|
|
|
* sgx_free_epc_page() - Free an EPC page
|
|
|
|
* @page: an EPC page
|
|
|
|
*
|
x86/sgx: Wipe out EREMOVE from sgx_free_epc_page()
EREMOVE takes a page and removes any association between that page and
an enclave. It must be run on a page before it can be added into another
enclave. Currently, EREMOVE is run as part of pages being freed into the
SGX page allocator. It is not expected to fail, as it would indicate a
use-after-free of EPC pages. Rather than add the page back to the pool
of available EPC pages, the kernel intentionally leaks the page to avoid
additional errors in the future.
However, KVM does not track how guest pages are used, which means that
SGX virtualization use of EREMOVE might fail. Specifically, it is
legitimate that EREMOVE returns SGX_CHILD_PRESENT for EPC assigned to
KVM guest, because KVM/kernel doesn't track SECS pages.
To allow SGX/KVM to introduce a more permissive EREMOVE helper and
to let the SGX virtualization code use the allocator directly, break
out the EREMOVE call from the SGX page allocator. Rename the original
sgx_free_epc_page() to sgx_encl_free_epc_page(), indicating that
it is used to free an EPC page assigned to a host enclave. Replace
sgx_free_epc_page() with sgx_encl_free_epc_page() in all call sites so
there's no functional change.
At the same time, improve the error message when EREMOVE fails, and
add documentation to explain to the user what that failure means and
to suggest to the user what to do when this bug happens in the case it
happens.
[ bp: Massage commit message, fix typos and sanitize text, simplify. ]
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/20210325093057.122834-1-kai.huang@intel.com
2021-03-25 17:30:57 +08:00
|
|
|
* Put the EPC page back to the list of free pages. It's the caller's
|
|
|
|
* responsibility to make sure that the page is in uninitialized state. In other
|
|
|
|
* words, do EREMOVE, EWB or whatever operation is necessary before calling
|
|
|
|
* this function.
|
2020-11-13 06:01:20 +08:00
|
|
|
*/
|
|
|
|
void sgx_free_epc_page(struct sgx_epc_page *page)
|
|
|
|
{
|
|
|
|
struct sgx_epc_section *section = &sgx_epc_sections[page->section];
|
2021-03-18 07:53:31 +08:00
|
|
|
struct sgx_numa_node *node = section->node;
|
2020-11-13 06:01:20 +08:00
|
|
|
|
2021-03-18 07:53:31 +08:00
|
|
|
spin_lock(&node->lock);
|
|
|
|
|
2021-10-27 06:00:46 +08:00
|
|
|
page->owner = NULL;
|
|
|
|
if (page->poison)
|
|
|
|
list_add(&page->list, &node->sgx_poison_page_list);
|
|
|
|
else
|
|
|
|
list_add_tail(&page->list, &node->free_page_list);
|
2021-10-27 06:00:44 +08:00
|
|
|
page->flags = SGX_EPC_PAGE_IS_FREE;
|
2021-03-18 07:53:31 +08:00
|
|
|
|
|
|
|
spin_unlock(&node->lock);
|
x86/sgx: Fix free page accounting
The SGX driver maintains a single global free page counter,
sgx_nr_free_pages, that reflects the number of free pages available
across all NUMA nodes. Correspondingly, a list of free pages is
associated with each NUMA node and sgx_nr_free_pages is updated
every time a page is added or removed from any of the free page
lists. The main usage of sgx_nr_free_pages is by the reclaimer
that runs when it (sgx_nr_free_pages) goes below a watermark
to ensure that there are always some free pages available to, for
example, support efficient page faults.
With sgx_nr_free_pages accessed and modified from a few places
it is essential to ensure that these accesses are done safely but
this is not the case. sgx_nr_free_pages is read without any
protection and updated with inconsistent protection by any one
of the spin locks associated with the individual NUMA nodes.
For example:
CPU_A CPU_B
----- -----
spin_lock(&nodeA->lock); spin_lock(&nodeB->lock);
... ...
sgx_nr_free_pages--; /* NOT SAFE */ sgx_nr_free_pages--;
spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock);
Since sgx_nr_free_pages may be protected by different spin locks
while being modified from different CPUs, the following scenario
is possible:
CPU_A CPU_B
----- -----
{sgx_nr_free_pages = 100}
spin_lock(&nodeA->lock); spin_lock(&nodeB->lock);
sgx_nr_free_pages--; sgx_nr_free_pages--;
/* LOAD sgx_nr_free_pages = 100 */ /* LOAD sgx_nr_free_pages = 100 */
/* sgx_nr_free_pages-- */ /* sgx_nr_free_pages-- */
/* STORE sgx_nr_free_pages = 99 */ /* STORE sgx_nr_free_pages = 99 */
spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock);
In the above scenario, sgx_nr_free_pages is decremented from two CPUs
but instead of sgx_nr_free_pages ending with a value that is two less
than it started with, it was only decremented by one while the number
of free pages were actually reduced by two. The consequence of
sgx_nr_free_pages not being protected is that its value may not
accurately reflect the actual number of free pages on the system,
impacting the availability of free pages in support of many flows.
The problematic scenario is when the reclaimer does not run because it
believes there to be sufficient free pages while any attempt to allocate
a page fails because there are no free pages available. In the SGX driver
the reclaimer's watermark is only 32 pages so after encountering the
above example scenario 32 times a user space hang is possible when there
are no more free pages because of repeated page faults caused by no
free pages made available.
The following flow was encountered:
asm_exc_page_fault
...
sgx_vma_fault()
sgx_encl_load_page()
sgx_encl_eldu() // Encrypted page needs to be loaded from backing
// storage into newly allocated SGX memory page
sgx_alloc_epc_page() // Allocate a page of SGX memory
__sgx_alloc_epc_page() // Fails, no free SGX memory
...
if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) // Wake reclaimer
wake_up(&ksgxd_waitq);
return -EBUSY; // Return -EBUSY giving reclaimer time to run
return -EBUSY;
return -EBUSY;
return VM_FAULT_NOPAGE;
The reclaimer is triggered in above flow with the following code:
static bool sgx_should_reclaim(unsigned long watermark)
{
return sgx_nr_free_pages < watermark &&
!list_empty(&sgx_active_page_list);
}
In the problematic scenario there were no free pages available yet the
value of sgx_nr_free_pages was above the watermark. The allocation of
SGX memory thus always failed because of a lack of free pages while no
free pages were made available because the reclaimer is never started
because of sgx_nr_free_pages' incorrect value. The consequence was that
user space kept encountering VM_FAULT_NOPAGE that caused the same
address to be accessed repeatedly with the same result.
Change the global free page counter to an atomic type that
ensures simultaneous updates are done safely. While doing so, move
the updating of the variable outside of the spin lock critical
section to which it does not belong.
Cc: stable@vger.kernel.org
Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()")
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/a95a40743bbd3f795b465f30922dde7f1ea9e0eb.1637004094.git.reinette.chatre@intel.com
2021-11-16 03:29:04 +08:00
|
|
|
atomic_long_inc(&sgx_nr_free_pages);
|
2020-11-13 06:01:20 +08:00
|
|
|
}
|
|
|
|
|
2020-11-13 06:01:16 +08:00
|
|
|
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
|
|
|
|
unsigned long index,
|
|
|
|
struct sgx_epc_section *section)
|
|
|
|
{
|
|
|
|
unsigned long nr_pages = size >> PAGE_SHIFT;
|
|
|
|
unsigned long i;
|
|
|
|
|
|
|
|
section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
|
|
|
|
if (!section->virt_addr)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
|
|
|
|
if (!section->pages) {
|
|
|
|
memunmap(section->virt_addr);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
section->phys_addr = phys_addr;
|
2021-10-27 06:00:45 +08:00
|
|
|
xa_store_range(&sgx_epc_address_space, section->phys_addr,
|
|
|
|
phys_addr + size - 1, section, GFP_KERNEL);
|
2020-11-13 06:01:16 +08:00
|
|
|
|
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
|
|
section->pages[i].section = index;
|
x86/sgx: Add a page reclaimer
Just like normal RAM, there is a limited amount of enclave memory available
and overcommitting it is a very valuable tool to reduce resource use.
Introduce a simple reclaim mechanism for enclave pages.
In contrast to normal page reclaim, the kernel cannot directly access
enclave memory. To get around this, the SGX architecture provides a set of
functions to help. Among other things, these functions copy enclave memory
to and from normal memory, encrypting it and protecting its integrity in
the process.
Implement a page reclaimer by using these functions. Picks victim pages in
LRU fashion from all the enclaves running in the system. A new kernel
thread (ksgxswapd) reclaims pages in the background based on watermarks,
similar to normal kswapd.
All enclave pages can be reclaimed, architecturally. But, there are some
limits to this, such as the special SECS metadata page which must be
reclaimed last. The page version array (used to mitigate replaying old
reclaimed pages) is also architecturally reclaimable, but not yet
implemented. The end result is that the vast majority of enclave pages are
currently reclaimable.
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-22-jarkko@kernel.org
2020-11-13 06:01:32 +08:00
|
|
|
section->pages[i].flags = 0;
|
|
|
|
section->pages[i].owner = NULL;
|
2021-10-27 06:00:46 +08:00
|
|
|
section->pages[i].poison = 0;
|
2021-03-18 07:53:30 +08:00
|
|
|
list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list);
|
2020-11-13 06:01:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2021-10-27 06:00:45 +08:00
|
|
|
bool arch_is_platform_page(u64 paddr)
|
|
|
|
{
|
|
|
|
return !!xa_load(&sgx_epc_address_space, paddr);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(arch_is_platform_page);
|
|
|
|
|
2021-10-27 06:00:47 +08:00
|
|
|
static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
|
|
|
|
{
|
|
|
|
struct sgx_epc_section *section;
|
|
|
|
|
|
|
|
section = xa_load(&sgx_epc_address_space, paddr);
|
|
|
|
if (!section)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return §ion->pages[PFN_DOWN(paddr - section->phys_addr)];
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called in process context to handle a hardware reported
|
|
|
|
* error in an SGX EPC page.
|
|
|
|
* If the MF_ACTION_REQUIRED bit is set in flags, then the
|
|
|
|
* context is the task that consumed the poison data. Otherwise
|
|
|
|
* this is called from a kernel thread unrelated to the page.
|
|
|
|
*/
|
|
|
|
int arch_memory_failure(unsigned long pfn, int flags)
|
|
|
|
{
|
|
|
|
struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
|
|
|
|
struct sgx_epc_section *section;
|
|
|
|
struct sgx_numa_node *node;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* mm/memory-failure.c calls this routine for all errors
|
|
|
|
* where there isn't a "struct page" for the address. But that
|
|
|
|
* includes other address ranges besides SGX.
|
|
|
|
*/
|
|
|
|
if (!page)
|
|
|
|
return -ENXIO;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If poison was consumed synchronously. Send a SIGBUS to
|
|
|
|
* the task. Hardware has already exited the SGX enclave and
|
|
|
|
* will not allow re-entry to an enclave that has a memory
|
|
|
|
* error. The signal may help the task understand why the
|
|
|
|
* enclave is broken.
|
|
|
|
*/
|
|
|
|
if (flags & MF_ACTION_REQUIRED)
|
|
|
|
force_sig(SIGBUS);
|
|
|
|
|
|
|
|
section = &sgx_epc_sections[page->section];
|
|
|
|
node = section->node;
|
|
|
|
|
|
|
|
spin_lock(&node->lock);
|
|
|
|
|
|
|
|
/* Already poisoned? Nothing more to do */
|
|
|
|
if (page->poison)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
page->poison = 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the page is on a free list, move it to the per-node
|
|
|
|
* poison page list.
|
|
|
|
*/
|
|
|
|
if (page->flags & SGX_EPC_PAGE_IS_FREE) {
|
|
|
|
list_move(&page->list, &node->sgx_poison_page_list);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TBD: Add additional plumbing to enable pre-emptive
|
|
|
|
* action for asynchronous poison notification. Until
|
|
|
|
* then just hope that the poison:
|
|
|
|
* a) is not accessed - sgx_free_epc_page() will deal with it
|
|
|
|
* when the user gives it back
|
|
|
|
* b) results in a recoverable machine check rather than
|
|
|
|
* a fatal one
|
|
|
|
*/
|
|
|
|
out:
|
|
|
|
spin_unlock(&node->lock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-11-13 06:01:16 +08:00
|
|
|
/**
|
|
|
|
* A section metric is concatenated in a way that @low bits 12-31 define the
|
|
|
|
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
|
|
|
|
* metric.
|
|
|
|
*/
|
|
|
|
static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
|
|
|
|
{
|
|
|
|
return (low & GENMASK_ULL(31, 12)) +
|
|
|
|
((high & GENMASK_ULL(19, 0)) << 32);
|
|
|
|
}
|
|
|
|
|
2022-01-05 01:15:27 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
|
|
|
|
{
|
|
|
|
return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
|
|
|
|
}
|
|
|
|
static DEVICE_ATTR_RO(sgx_total_bytes);
|
|
|
|
|
|
|
|
static umode_t arch_node_attr_is_visible(struct kobject *kobj,
|
|
|
|
struct attribute *attr, int idx)
|
|
|
|
{
|
|
|
|
/* Make all x86/ attributes invisible when SGX is not initialized: */
|
|
|
|
if (nodes_empty(sgx_numa_mask))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return attr->mode;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct attribute *arch_node_dev_attrs[] = {
|
|
|
|
&dev_attr_sgx_total_bytes.attr,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
|
|
|
const struct attribute_group arch_node_dev_group = {
|
|
|
|
.name = "x86",
|
|
|
|
.attrs = arch_node_dev_attrs,
|
|
|
|
.is_visible = arch_node_attr_is_visible,
|
|
|
|
};
|
|
|
|
|
|
|
|
static void __init arch_update_sysfs_visibility(int nid)
|
|
|
|
{
|
|
|
|
struct node *node = node_devices[nid];
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
pr_err("sysfs update failed (%d), files may be invisible", ret);
|
|
|
|
}
|
|
|
|
#else /* !CONFIG_NUMA */
|
|
|
|
static void __init arch_update_sysfs_visibility(int nid) {}
|
|
|
|
#endif
|
|
|
|
|
2020-11-13 06:01:16 +08:00
|
|
|
static bool __init sgx_page_cache_init(void)
|
|
|
|
{
|
|
|
|
u32 eax, ebx, ecx, edx, type;
|
|
|
|
u64 pa, size;
|
2021-03-18 07:53:31 +08:00
|
|
|
int nid;
|
2020-11-13 06:01:16 +08:00
|
|
|
int i;
|
|
|
|
|
2021-03-18 07:53:31 +08:00
|
|
|
sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
|
|
|
|
if (!sgx_numa_nodes)
|
|
|
|
return false;
|
|
|
|
|
2020-11-13 06:01:16 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
|
|
|
|
cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
|
|
|
|
|
|
|
|
type = eax & SGX_CPUID_EPC_MASK;
|
|
|
|
if (type == SGX_CPUID_EPC_INVALID)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (type != SGX_CPUID_EPC_SECTION) {
|
|
|
|
pr_err_once("Unknown EPC section type: %u\n", type);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
pa = sgx_calc_section_metric(eax, ebx);
|
|
|
|
size = sgx_calc_section_metric(ecx, edx);
|
|
|
|
|
|
|
|
pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);
|
|
|
|
|
|
|
|
if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
|
|
|
|
pr_err("No free memory for an EPC section\n");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2021-03-18 07:53:31 +08:00
|
|
|
nid = numa_map_to_online_node(phys_to_target_node(pa));
|
|
|
|
if (nid == NUMA_NO_NODE) {
|
|
|
|
/* The physical address is already printed above. */
|
|
|
|
pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
|
|
|
|
nid = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!node_isset(nid, sgx_numa_mask)) {
|
|
|
|
spin_lock_init(&sgx_numa_nodes[nid].lock);
|
|
|
|
INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
|
2021-10-27 06:00:46 +08:00
|
|
|
INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
|
2021-03-18 07:53:31 +08:00
|
|
|
node_set(nid, sgx_numa_mask);
|
x86/sgx: Add an attribute for the amount of SGX memory in a NUMA node
== Problem ==
The amount of SGX memory on a system is determined by the BIOS and it
varies wildly between systems. It can be as small as dozens of MB's
and as large as many GB's on servers. Just like how applications need
to know how much regular RAM is available, enclave builders need to
know how much SGX memory an enclave can consume.
== Solution ==
Introduce a new sysfs file:
/sys/devices/system/node/nodeX/x86/sgx_total_bytes
to enumerate the amount of SGX memory available in each NUMA node.
This serves the same function for SGX as /proc/meminfo or
/sys/devices/system/node/nodeX/meminfo does for normal RAM.
'sgx_total_bytes' is needed today to help drive the SGX selftests.
SGX-specific swap code is exercised by creating overcommitted enclaves
which are larger than the physical SGX memory on the system. They
currently use a CPUID-based approach which can diverge from the actual
amount of SGX memory available. 'sgx_total_bytes' ensures that the
selftests can work efficiently and do not attempt stupid things like
creating a 100,000 MB enclave on a system with 128 MB of SGX memory.
== Implementation Details ==
Introduce CONFIG_HAVE_ARCH_NODE_DEV_GROUP opt-in flag to expose an
arch specific attribute group, and add an attribute for the amount of
SGX memory in bytes to each NUMA node:
== ABI Design Discussion ==
As opposed to the per-node ABI, a single, global ABI was considered.
However, this would prevent enclaves from being able to size
themselves so that they fit on a single NUMA node. Essentially, a
single value would rule out NUMA optimizations for enclaves.
Create a new "x86/" directory inside each "nodeX/" sysfs directory.
'sgx_total_bytes' is expected to be the first of at least a few
sgx-specific files to be placed in the new directory. Just scanning
/proc/meminfo, these are the no-brainers that we have for RAM, but we
need for SGX:
MemTotal: xxxx kB // sgx_total_bytes (implemented here)
MemFree: yyyy kB // sgx_free_bytes
SwapTotal: zzzz kB // sgx_swapped_bytes
So, at *least* three. I think we will eventually end up needing
something more along the lines of a dozen. A new directory (as
opposed to being in the nodeX/ "root") directory avoids cluttering the
root with several "sgx_*" files.
Place the new file in a new "nodeX/x86/" directory because SGX is
highly x86-specific. It is very unlikely that any other architecture
(or even non-Intel x86 vendor) will ever implement SGX. Using "sgx/"
as opposed to "x86/" was also considered. But, there is a real chance
this can get used for other arch-specific purposes.
[ dhansen: rewrite changelog ]
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20211116162116.93081-2-jarkko@kernel.org
2021-11-17 00:21:16 +08:00
|
|
|
sgx_numa_nodes[nid].size = 0;
|
2022-01-05 01:15:27 +08:00
|
|
|
|
|
|
|
/* Make SGX-specific node sysfs files visible: */
|
|
|
|
arch_update_sysfs_visibility(nid);
|
2021-03-18 07:53:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
|
x86/sgx: Add an attribute for the amount of SGX memory in a NUMA node
== Problem ==
The amount of SGX memory on a system is determined by the BIOS and it
varies wildly between systems. It can be as small as dozens of MB's
and as large as many GB's on servers. Just like how applications need
to know how much regular RAM is available, enclave builders need to
know how much SGX memory an enclave can consume.
== Solution ==
Introduce a new sysfs file:
/sys/devices/system/node/nodeX/x86/sgx_total_bytes
to enumerate the amount of SGX memory available in each NUMA node.
This serves the same function for SGX as /proc/meminfo or
/sys/devices/system/node/nodeX/meminfo does for normal RAM.
'sgx_total_bytes' is needed today to help drive the SGX selftests.
SGX-specific swap code is exercised by creating overcommitted enclaves
which are larger than the physical SGX memory on the system. They
currently use a CPUID-based approach which can diverge from the actual
amount of SGX memory available. 'sgx_total_bytes' ensures that the
selftests can work efficiently and do not attempt stupid things like
creating a 100,000 MB enclave on a system with 128 MB of SGX memory.
== Implementation Details ==
Introduce CONFIG_HAVE_ARCH_NODE_DEV_GROUP opt-in flag to expose an
arch specific attribute group, and add an attribute for the amount of
SGX memory in bytes to each NUMA node:
== ABI Design Discussion ==
As opposed to the per-node ABI, a single, global ABI was considered.
However, this would prevent enclaves from being able to size
themselves so that they fit on a single NUMA node. Essentially, a
single value would rule out NUMA optimizations for enclaves.
Create a new "x86/" directory inside each "nodeX/" sysfs directory.
'sgx_total_bytes' is expected to be the first of at least a few
sgx-specific files to be placed in the new directory. Just scanning
/proc/meminfo, these are the no-brainers that we have for RAM, but we
need for SGX:
MemTotal: xxxx kB // sgx_total_bytes (implemented here)
MemFree: yyyy kB // sgx_free_bytes
SwapTotal: zzzz kB // sgx_swapped_bytes
So, at *least* three. I think we will eventually end up needing
something more along the lines of a dozen. A new directory (as
opposed to being in the nodeX/ "root") directory avoids cluttering the
root with several "sgx_*" files.
Place the new file in a new "nodeX/x86/" directory because SGX is
highly x86-specific. It is very unlikely that any other architecture
(or even non-Intel x86 vendor) will ever implement SGX. Using "sgx/"
as opposed to "x86/" was also considered. But, there is a real chance
this can get used for other arch-specific purposes.
[ dhansen: rewrite changelog ]
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20211116162116.93081-2-jarkko@kernel.org
2021-11-17 00:21:16 +08:00
|
|
|
sgx_numa_nodes[nid].size += size;
|
2021-03-18 07:53:31 +08:00
|
|
|
|
2020-11-13 06:01:16 +08:00
|
|
|
sgx_nr_epc_sections++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!sgx_nr_epc_sections) {
|
|
|
|
pr_err("There are zero EPC sections.\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2021-03-19 15:23:07 +08:00
|
|
|
/*
|
|
|
|
* Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
|
|
|
|
* Bare-metal driver requires to update them to hash of enclave's signer
|
|
|
|
* before EINIT. KVM needs to update them to guest's virtual MSR values
|
|
|
|
* before doing EINIT from guest.
|
|
|
|
*/
|
|
|
|
void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
WARN_ON_ONCE(preemptible());
|
|
|
|
|
|
|
|
for (i = 0; i < 4; i++)
|
|
|
|
wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
|
|
|
|
}
|
|
|
|
|
2021-03-19 15:23:09 +08:00
|
|
|
const struct file_operations sgx_provision_fops = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct miscdevice sgx_dev_provision = {
|
|
|
|
.minor = MISC_DYNAMIC_MINOR,
|
|
|
|
.name = "sgx_provision",
|
|
|
|
.nodename = "sgx_provision",
|
|
|
|
.fops = &sgx_provision_fops,
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sgx_set_attribute() - Update allowed attributes given file descriptor
|
|
|
|
* @allowed_attributes: Pointer to allowed enclave attributes
|
|
|
|
* @attribute_fd: File descriptor for specific attribute
|
|
|
|
*
|
|
|
|
* Append enclave attribute indicated by file descriptor to allowed
|
|
|
|
* attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
|
|
|
|
* /dev/sgx_provision is supported.
|
|
|
|
*
|
|
|
|
* Return:
|
|
|
|
* -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
|
|
|
|
* -EINVAL: Invalid, or not supported file descriptor
|
|
|
|
*/
|
|
|
|
int sgx_set_attribute(unsigned long *allowed_attributes,
|
|
|
|
unsigned int attribute_fd)
|
|
|
|
{
|
|
|
|
struct file *file;
|
|
|
|
|
|
|
|
file = fget(attribute_fd);
|
|
|
|
if (!file)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (file->f_op != &sgx_provision_fops) {
|
|
|
|
fput(file);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
|
|
|
|
|
|
|
|
fput(file);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sgx_set_attribute);
|
|
|
|
|
2021-01-14 07:23:11 +08:00
|
|
|
static int __init sgx_init(void)
|
2020-11-13 06:01:16 +08:00
|
|
|
{
|
2020-11-13 06:01:22 +08:00
|
|
|
int ret;
|
2020-11-13 06:01:16 +08:00
|
|
|
int i;
|
|
|
|
|
2020-11-13 06:01:22 +08:00
|
|
|
if (!cpu_feature_enabled(X86_FEATURE_SGX))
|
2021-01-14 07:23:11 +08:00
|
|
|
return -ENODEV;
|
2020-11-13 06:01:16 +08:00
|
|
|
|
|
|
|
if (!sgx_page_cache_init())
|
2021-01-14 07:23:11 +08:00
|
|
|
return -ENOMEM;
|
2020-11-13 06:01:16 +08:00
|
|
|
|
2021-01-14 07:23:11 +08:00
|
|
|
if (!sgx_page_reclaimer_init()) {
|
|
|
|
ret = -ENOMEM;
|
2020-11-13 06:01:16 +08:00
|
|
|
goto err_page_cache;
|
2021-01-14 07:23:11 +08:00
|
|
|
}
|
2020-11-13 06:01:16 +08:00
|
|
|
|
2021-03-19 15:23:09 +08:00
|
|
|
ret = misc_register(&sgx_dev_provision);
|
|
|
|
if (ret)
|
|
|
|
goto err_kthread;
|
|
|
|
|
2021-03-19 15:23:02 +08:00
|
|
|
/*
|
|
|
|
* Always try to initialize the native *and* KVM drivers.
|
|
|
|
* The KVM driver is less picky than the native one and
|
|
|
|
* can function if the native one is not supported on the
|
|
|
|
* current system or fails to initialize.
|
|
|
|
*
|
|
|
|
* Error out only if both fail to initialize.
|
|
|
|
*/
|
2020-11-13 06:01:22 +08:00
|
|
|
ret = sgx_drv_init();
|
2021-03-19 15:23:02 +08:00
|
|
|
|
|
|
|
if (sgx_vepc_init() && ret)
|
2021-03-19 15:23:09 +08:00
|
|
|
goto err_provision;
|
2020-11-13 06:01:22 +08:00
|
|
|
|
2021-01-14 07:23:11 +08:00
|
|
|
return 0;
|
2020-11-13 06:01:16 +08:00
|
|
|
|
2021-03-19 15:23:09 +08:00
|
|
|
err_provision:
|
|
|
|
misc_deregister(&sgx_dev_provision);
|
|
|
|
|
2020-11-13 06:01:22 +08:00
|
|
|
err_kthread:
|
|
|
|
kthread_stop(ksgxd_tsk);
|
|
|
|
|
2020-11-13 06:01:16 +08:00
|
|
|
err_page_cache:
|
|
|
|
for (i = 0; i < sgx_nr_epc_sections; i++) {
|
|
|
|
vfree(sgx_epc_sections[i].pages);
|
|
|
|
memunmap(sgx_epc_sections[i].virt_addr);
|
|
|
|
}
|
2021-01-14 07:23:11 +08:00
|
|
|
|
|
|
|
return ret;
|
2020-11-13 06:01:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
device_initcall(sgx_init);
|