drm/amdkfd: CRIU resume shared virtual memory ranges

In CRIU resume stage, resume all the shared virtual memory ranges from
the data stored inside the resuming kfd process during CRIU restore
phase. Also set up xnack mode and free up the resources.

KFD_IOCTL_SVM_ATTR_CLR_FLAGS is not available for querying via get_attr
interface but we must clear the flags during restore as there might be
some default flags set when the prange is created. Also handle the
invalid PREFETCH attribute values saved during checkpoint by replacing
them with another dummy KFD_IOCTL_SVM_ATTR_SET_FLAGS attribute.

(rajneesh: Fixed the checkpatch reported problems)
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Rajneesh Bhardwaj 2021-11-08 17:33:42 -05:00 committed by Alex Deucher
parent c2db32ce77
commit 2a909ae718
3 changed files with 119 additions and 0 deletions

View File

@ -2766,7 +2766,17 @@ static int criu_resume(struct file *filep,
}
mutex_lock(&target->mutex);
ret = kfd_criu_resume_svm(target);
if (ret) {
pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
goto exit;
}
ret = amdgpu_amdkfd_criu_resume(target->kgd_process_info);
if (ret)
pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
exit:
mutex_unlock(&target->mutex);
kfd_unref_process(target);

View File

@ -3487,6 +3487,109 @@ fill_values:
return 0;
}
/**
 * kfd_criu_resume_svm - apply the SVM range attributes saved by CRIU restore
 * @p: kfd process being resumed
 *
 * Walks p->svms.criu_svm_metadata_list and, for every saved range, calls
 * svm_range_set_attr() with the saved attributes plus one appended
 * KFD_IOCTL_SVM_ATTR_CLR_FLAGS attribute. Every metadata entry is unlinked
 * from the list and freed before returning, on success and on failure alike,
 * so the list head never refers to freed memory afterwards.
 *
 * Return: 0 on success or when the list is empty, -ESRCH if the mm of
 * p->lead_thread cannot be obtained, -ENOMEM if the attribute buffer cannot
 * be allocated, or the error returned by svm_range_set_attr().
 */
int kfd_criu_resume_svm(struct kfd_process *p)
{
	struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL;
	int nattr_common = 4, nattr_accessibility = 1;
	struct criu_svm_metadata *criu_svm_md = NULL;
	struct svm_range_list *svms = &p->svms;
	struct criu_svm_metadata *next = NULL;
	uint32_t set_flags = 0xffffffff;
	int i, j, num_attrs, ret = 0;
	uint64_t set_attr_size;
	struct mm_struct *mm;

	if (list_empty(&svms->criu_svm_metadata_list)) {
		pr_debug("No SVM data from CRIU restore stage 2\n");
		return ret;
	}

	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_err("failed to get mm for the target process\n");
		/* Still release the saved metadata; nothing else will. */
		ret = -ESRCH;
		goto exit_free_md;
	}

	/* Four common attributes plus one accessibility attribute per pdd. */
	num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);

	i = 0;
	list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) {
		pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n",
			 i, criu_svm_md->data.start_addr, criu_svm_md->data.size);

		for (j = 0; j < num_attrs; j++) {
			pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x \ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
				 i, j, criu_svm_md->data.attrs[j].type,
				 i, j, criu_svm_md->data.attrs[j].value);
			switch (criu_svm_md->data.attrs[j].type) {
			/* During Checkpoint operation, the query for
			 * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC attribute might
			 * return KFD_IOCTL_SVM_LOCATION_UNDEFINED if they were
			 * not used by the range which was checkpointed. Care
			 * must be taken to not restore with an invalid value
			 * otherwise the gpuidx value will be invalid and
			 * set_attr would eventually fail so just replace those
			 * with another dummy attribute such as
			 * KFD_IOCTL_SVM_ATTR_SET_FLAGS.
			 */
			case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
				if (criu_svm_md->data.attrs[j].value ==
				    KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
					criu_svm_md->data.attrs[j].type =
						KFD_IOCTL_SVM_ATTR_SET_FLAGS;
					criu_svm_md->data.attrs[j].value = 0;
				}
				break;
			case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
				/* Remembered so the complement can be cleared below. */
				set_flags = criu_svm_md->data.attrs[j].value;
				break;
			default:
				break;
			}
		}

		/* CLR_FLAGS is not available via get_attr during checkpoint but
		 * it needs to be inserted before restoring the ranges so
		 * allocate extra space for it before calling set_attr
		 */
		set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
						(num_attrs + 1);
		/* Keep the old pointer until krealloc() is known to succeed. */
		set_attr_new = krealloc(set_attr, set_attr_size,
					GFP_KERNEL);
		if (!set_attr_new) {
			ret = -ENOMEM;
			goto exit;
		}
		set_attr = set_attr_new;
		memcpy(set_attr, criu_svm_md->data.attrs, num_attrs *
					sizeof(struct kfd_ioctl_svm_attribute));
		set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS;
		set_attr[num_attrs].value = ~set_flags;

		ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr,
					 criu_svm_md->data.size, num_attrs + 1,
					 set_attr);
		if (ret) {
			pr_err("CRIU: failed to set range attributes\n");
			goto exit;
		}

		i++;
	}
exit:
	kfree(set_attr);
	mmput(mm);
exit_free_md:
	list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) {
		pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
			 criu_svm_md->data.start_addr);
		/* Unlink before freeing so the list head is left empty and valid. */
		list_del(&criu_svm_md->list);
		kfree(criu_svm_md);
	}

	return ret;
}
int kfd_criu_restore_svm(struct kfd_process *p,
uint8_t __user *user_priv_ptr,
uint64_t *priv_data_offset,

View File

@ -192,6 +192,7 @@ int kfd_criu_restore_svm(struct kfd_process *p,
uint8_t __user *user_priv_ptr,
uint64_t *priv_data_offset,
uint64_t max_priv_data_size);
/* Apply the SVM range attributes queued on p->svms.criu_svm_metadata_list
 * during CRIU restore; returns 0 on success or a negative errno.
 */
int kfd_criu_resume_svm(struct kfd_process *p);
struct kfd_process_device *
svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev);
void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_struct *mm);
@ -253,6 +254,11 @@ static inline int kfd_criu_restore_svm(struct kfd_process *p,
return -EINVAL;
}
/* Stub used when CONFIG_HSA_AMD_SVM is disabled: there are no SVM ranges
 * to resume, so report success without touching @p.
 */
static inline int
kfd_criu_resume_svm(struct kfd_process *p)
{
	return 0;
}
#define KFD_IS_SVM_API_SUPPORTED(dev) false
#endif /* IS_ENABLED(CONFIG_HSA_AMD_SVM) */